list.of.packages <- c("tidyverse", "reshape2", "plotly", "vcfR", "SNPRelate", "network", "janitor", "vcfR") #add new libraries here 
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Load sample info and create dataframes for each stage

load(file="../raw-data/sample.info")

sample.info.adult <- sample.info %>%
          select(sample, stage, population, pCO2.parent, temp.parent) %>% filter(stage=="adult") %>% 
 mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-"))) %>% droplevels()

sample.info.larvae <- sample.info %>%
          select(sample, stage, population, pCO2.parent) %>% filter(stage=="larvae" & sample!=571) %>% 
 mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-")),
        sample=str_pad(sample, width=3, side="left", pad="0")) %>% droplevels()

sample.info.juvenile <- sample.info %>%
          select(sample, stage, population, pCO2.parent) %>% filter(stage=="juvenile") %>% 
 mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-")),
        sample=str_pad(sample, width=3, side="left", pad="0")) %>% droplevels()

ADULTS, COMPARISON AMONG PCO2 TREATMENTS WITHIN POPULATIONS

Use gatk to hard filter variants for depth & quality

/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
##     java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 20:23:45.707 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:23:46 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:23:46.203 INFO  VariantFiltration - ------------------------------------------------------------
## 20:23:46.203 INFO  VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:23:46.203 INFO  VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:23:46.203 INFO  VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:23:46.203 INFO  VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:23:46.204 INFO  VariantFiltration - Start Date/Time: April 20, 2021 at 8:23:45 PM PDT
## 20:23:46.204 INFO  VariantFiltration - ------------------------------------------------------------
## 20:23:46.204 INFO  VariantFiltration - ------------------------------------------------------------
## 20:23:46.204 INFO  VariantFiltration - HTSJDK Version: 2.23.0
## 20:23:46.204 INFO  VariantFiltration - Picard Version: 2.23.3
## 20:23:46.204 INFO  VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:23:46.204 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:23:46.204 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:23:46.204 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:23:46.204 INFO  VariantFiltration - Deflater: IntelDeflater
## 20:23:46.204 INFO  VariantFiltration - Inflater: IntelInflater
## 20:23:46.205 INFO  VariantFiltration - GCS max retries/reopens: 20
## 20:23:46.205 INFO  VariantFiltration - Requester pays: disabled
## 20:23:46.205 INFO  VariantFiltration - Initializing engine
## 20:23:46.333 INFO  FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz
## 20:23:46.445 INFO  VariantFiltration - Done initializing engine
## 20:23:46.508 INFO  ProgressMeter - Starting traversal
## 20:23:46.508 INFO  ProgressMeter -        Current Locus  Elapsed Minutes    Variants Processed  Variants/Minute
## 20:23:56.520 INFO  ProgressMeter - Contig41068_47193:2790524              0.2                557000        3337994.4
## 20:24:02.134 INFO  ProgressMeter - Contig98527_104152:29794069              0.3                924617        3550302.1
## 20:24:02.135 INFO  ProgressMeter - Traversal complete. Processed 924617 total variants in 0.3 minutes.
## 20:24:02.201 INFO  VariantFiltration - Shutting down engine
## [April 20, 2021 at 8:24:02 PM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.28 minutes.
## Runtime.totalMemory()=322961408

Select only SNPs that pass filtering

/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
##     java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## 20:24:04.169 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:24:04 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:24:04.431 INFO  SelectVariants - ------------------------------------------------------------
## 20:24:04.432 INFO  SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:24:04.432 INFO  SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:24:04.432 INFO  SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:24:04.432 INFO  SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:24:04.432 INFO  SelectVariants - Start Date/Time: April 20, 2021 at 8:24:04 PM PDT
## 20:24:04.432 INFO  SelectVariants - ------------------------------------------------------------
## 20:24:04.432 INFO  SelectVariants - ------------------------------------------------------------
## 20:24:04.432 INFO  SelectVariants - HTSJDK Version: 2.23.0
## 20:24:04.432 INFO  SelectVariants - Picard Version: 2.23.3
## 20:24:04.433 INFO  SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:24:04.433 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:24:04.433 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:24:04.433 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:24:04.433 INFO  SelectVariants - Deflater: IntelDeflater
## 20:24:04.433 INFO  SelectVariants - Inflater: IntelInflater
## 20:24:04.433 INFO  SelectVariants - GCS max retries/reopens: 20
## 20:24:04.433 INFO  SelectVariants - Requester pays: disabled
## 20:24:04.433 INFO  SelectVariants - Initializing engine
## 20:24:04.558 INFO  FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz
## 20:24:04.652 INFO  SelectVariants - Done initializing engine
## 20:24:04.683 INFO  ProgressMeter - Starting traversal
## 20:24:04.683 INFO  ProgressMeter -        Current Locus  Elapsed Minutes    Variants Processed  Variants/Minute
## 20:24:14.723 INFO  ProgressMeter - Contig11179_21262:74087300              0.2                167000         998008.0
## 20:24:24.752 INFO  ProgressMeter - Contig28136_34745:69884028              0.3                375000        1121188.0
## 20:24:34.768 INFO  ProgressMeter - Contig53181_59083:51903313              0.5                589000        1174671.8
## 20:24:42.082 INFO  ProgressMeter - Contig98527_104152:29059358              0.6                747969        1199982.4
## 20:24:42.082 INFO  ProgressMeter - Traversal complete. Processed 747969 total variants in 0.6 minutes.
## 20:24:42.132 INFO  SelectVariants - Shutting down engine
## [April 20, 2021 at 8:24:42 PM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.63 minutes.
## Runtime.totalMemory()=387973120

Use SNPRelate to filter vcf files. Remove:

  • Select biallelic loci only
  • Loci with minor allele frequency < 5%
  • Select loci that are present across all samples (missing.rate=0)
#snpgdsClose(genofile.adult)
vcf.fn.adult <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.adult, "../qc-processing/gatk/genotypes-adult.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 53
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz" ...
##  import 175243 variants.
## + genotype   { Bit2 53x175243, 2.2M } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
##     open the file '../qc-processing/gatk/genotypes-adult.gds' (3.1M)
##     # of fragments: 68
##     save to '../qc-processing/gatk/genotypes-adult.gds.tmp'
##     rename '../qc-processing/gatk/genotypes-adult.gds.tmp' (3.1M, reduced: 576B)
##     # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-adult.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-adult.gds 
## The total number of samples: 53 
## The total number of SNPs: 175243 
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

Open gds and check out genofile contents

(genofile.adult <- snpgdsOpen("../qc-processing/gatk/genotypes-adult.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-adult.gds (3.1M)
## +    [  ] *
## |--+ sample.id   { Str8 53 LZMA_ra(76.4%), 169B }
## |--+ snp.id   { Int32 175243 LZMA_ra(6.47%), 44.3K }
## |--+ snp.rs.id   { Str8 175243 LZMA_ra(0.10%), 181B }
## |--+ snp.position   { Int32 175243 LZMA_ra(40.8%), 279.2K }
## |--+ snp.chromosome   { Str8 175243 LZMA_ra(0.03%), 973B }
## |--+ snp.allele   { Str8 175243 LZMA_ra(11.9%), 81.7K }
## |--+ genotype   { Bit2 53x175243, 2.2M } *
## \--+ snp.annot   [  ]
##    |--+ qual   { Float32 175243 LZMA_ra(65.4%), 448.0K }
##    \--+ filter   { Str8 175243 LZMA_ra(0.03%), 285B }

Prune/Clean SNPs

  • Remove those in linkage-disequilibrium
  • Remove those with Minor Allele Frequency <1%
  • Remove those that are missing from any sample
snpset.adult <- snpgdsLDpruning(genofile.adult, maf=.1, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 174,670 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0)
##     # of samples: 53
##     # of SNPs: 573
##     using 1 thread
##     sliding window: 500,000 basepairs, Inf SNPs
##     |LD| threshold: 0.2
##     method: composite
## Chromosome Contig0_5313: 0.23%, 50/21,863
## Chromosome Contig104153_109739: 0.13%, 3/2,360
## Chromosome Contig109741_116186: 0.27%, 5/1,861
## Chromosome Contig11179_21262: 0.29%, 46/15,668
## Chromosome Contig116188_123130: 0.11%, 2/1,794
## Chromosome Contig123131_129982: 0.22%, 4/1,787
## Chromosome Contig129983_136896: 0.22%, 4/1,798
## Chromosome Contig136897_143777: 0.16%, 3/1,887
## Chromosome Contig143780_150710: 0.06%, 1/1,746
## Chromosome Contig150711_166372: 0.40%, 6/1,485
## Chromosome Contig166374_181896: 0.21%, 4/1,875
## Chromosome Contig181898_197428: 0.20%, 3/1,516
## Chromosome Contig197431_213218: 0.06%, 1/1,777
## Chromosome Contig21263_28135: 0.29%, 53/18,170
## Chromosome Contig213221_398012: 0.38%, 7/1,819
## Chromosome Contig28136_34745: 0.26%, 41/15,929
## Chromosome Contig34748_41067: 0.40%, 54/13,582
## Chromosome Contig398123_696856: 0.29%, 5/1,721
## Chromosome Contig41068_47193: 0.25%, 28/11,130
## Chromosome Contig47194_53180: 0.31%, 28/9,167
## Chromosome Contig5314_11178: 0.17%, 9/5,362
## Chromosome Contig53181_59083: 0.22%, 17/7,588
## Chromosome Contig59084_64831: 0.22%, 14/6,364
## Chromosome Contig64832_70524: 0.28%, 16/5,633
## Chromosome Contig70525_76135: 0.28%, 13/4,689
## Chromosome Contig76136_81765: 0.17%, 7/4,040
## Chromosome Contig81766_87352: 0.24%, 10/4,163
## Chromosome Contig87353_92929: 0.29%, 8/2,773
## Chromosome Contig92931_98526: 0.22%, 6/2,712
## Chromosome Contig98527_104152: 0.27%, 8/2,984
## 456 markers are selected in total.
snpset.adult.id <- unlist(unname(snpset.adult))

For ADULT genotype analysis: ~2,950 SNPs remain after all filtering steps

Adult PCA

# PCA 
pca.adult <- snpgdsPCA(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 53
##     # of SNPs: 456
##     using 2 threads
##     # of principal components: 32
## PCA:    the sum of all selected genotypes (0,1,2) = 37200
## CPU capabilities: Double-Precision SSE2
## Tue Apr 20 20:24:49 2021    (internal increment: 19088)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:24:49 2021    Begin (eigenvalues and eigenvectors)
## Tue Apr 20 20:24:49 2021    Done.
pc.percent.adult <- pca.adult$varprop*100
head(round(pc.percent.adult, 2))
## [1] 9.47 6.28 5.59 4.66 4.53 4.11
tab.adult <- data.frame(sample.id = pca.adult$sample.id,
    EV1 = pca.adult$eigenvect[,1],    # the first eigenvector
    EV2 = pca.adult$eigenvect[,2],    # the second eigenvector
    EV3 = pca.adult$eigenvect[,3],    # the third eigenvector
    EV4 = pca.adult$eigenvect[,4],    # the fourth eigenvector
    EV5 = pca.adult$eigenvect[,5],    # the fifth eigenvector
    stringsAsFactors = FALSE)
tab.adult.annot <- left_join(tab.adult, sample.info.adult, by=c("sample.id"="sample")) %>% droplevels()
group.adult <- as.factor(tab.adult.annot$pop_code)

#ggplotly( 
    ggplot(tab.adult.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC2")#)

ggplotly( 
    ggplot(tab.adult.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC3"))
ggplotly( 
    ggplot(tab.adult.annot, aes(x=EV4, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC4"))
ggplotly( 
    ggplot(tab.adult.annot, aes(x=EV5, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC5"))

Adult cluster analysis

# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances, 
# and determine the groups by a permutation score. 
# Result= only 1 group (?)
ibs.hc.adult <- snpgdsHCluster(snpgdsIBS(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 53
##     # of SNPs: 456
##     using 2 threads
## IBS:    the sum of all selected genotypes (0,1,2) = 37200
## Tue Apr 20 20:24:50 2021    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:24:50 2021    Done.
rv.adult <- snpgdsCutTree(ibs.hc.adult)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.adult$dendrogram, leaflab="none", main="HapMap Phase II")

# Determine groups of individuals by population information
rv2.adult <- snpgdsCutTree(ibs.hc.adult, samp.group=as.factor(tab.adult.annot$pop_code))
## Create 6 groups.
plot(rv2.adult$dendrogram, leaflab="none", main="HapMap Phase II")
legend("bottom", legend=levels(group.adult), col=1:nlevels(group.adult), pch=19, ncol=4, cex=0.6, pt.cex=1)

Adult Allele frequencies

AF.adult.FB.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.adult.FB.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)
AF.adult.DB.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.adult.DB.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)
AF.adult.OB1.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Oyster Bay C1" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.adult.OB1.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Oyster Bay C1" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)

AF.adults <- rbind(do.call(cbind.data.frame, AF.adult.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.adult.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult"), pCO2=as.factor("ambient")), 
do.call(cbind.data.frame, AF.adult.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.adult.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.adult.OB1.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.adult.OB1.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult"), pCO2=as.factor("ambient")))

ggplot(AF.adults, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) + 
  stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
  ggtitle(paste("SNP Allele Frequency, Adults by population & pCO2 treatment\n", "(No. loci = ", length(snpset.adult.id), ")", sep="")) +
    geom_text(data=AF.adults %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2)))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.

## Adult Fst Calculations

Note: snpgdsFst() returns 5 things:

  • sample IDs
  • SNP ids (i.e. indices)
  • “Fst” = weighted Fst estimate
  • “MeanFst” = average of Fst estimates across all SNPs
  • “FstSNP” = A vector of Fst for each SNP
# within FB (comparing pCO2 exposure): 
Fst.adult.FB <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Fidalgo Bay")$sample.id, 
               population=as.factor(subset(tab.adult.annot, population=="Fidalgo Bay")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 21 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 18
##     # of SNPs: 435
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (9), High (9)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.adult.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High:  0.01504"
hist(Fst.adult.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, Adults", col="dodgerblue3")

# within DB (comparing pCO2 exposure): 
Fst.adult.DB <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id, 
                        sample.id=subset(tab.adult.annot, population=="Dabob Bay")$sample.id, 
               population=as.factor(subset(tab.adult.annot, population=="Dabob Bay")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 32 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 17
##     # of SNPs: 424
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (9), High (8)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.adult.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High:  0.01595"
hist(Fst.adult.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, Adults", col="darkseagreen3")

# within OB1 (comparing pCO2 exposure): 
Fst.adult.OB1 <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id, 
                         sample.id=subset(tab.adult.annot, population=="Oyster Bay C1")$sample.id, 
               population=as.factor(subset(tab.adult.annot, population=="Oyster Bay C1")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 36 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 18
##     # of SNPs: 420
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (9), High (9)
paste("Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: ", round(Fst.adult.OB1$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High:  -0.01404"
hist(Fst.adult.OB1$FstSNP, breaks = 100, main="Fst distribution\nOyster Bay C1 Ambient vs. High, Adults", col="salmon2")

Fst.adults <- rbind(do.call(cbind.data.frame, Fst.adult.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult")), 
do.call(cbind.data.frame, Fst.adult.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult")),
do.call(cbind.data.frame, Fst.adult.OB1[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult"))) 

ggplot(Fst.adults, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) + 
  stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
  ggtitle(paste("SNP Fst, Adults by pCO2 treatment\n", "(No. loci = ", length(snpset.adult.id), ")", sep="")) +
    geom_text(data=Fst.adults %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=.65, label=round(mean, 3)))
## Warning: `fun.y` is deprecated. Use `fun` instead.

## Adult Identity-by-state

lbls <- paste("PC", 1:4, "\n", format(pc.percent.adult[1:4], digits=2), "%", sep="")
pairs(pca.adult$eigenvect[,1:4], col=as.color(tab.adult.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")

pairs(pca.adult$eigenvect[,1:4], col=as.color(tab.adult.annot$pop_code), labels=lbls, main="colors = pop & pCO2")

ibs.adult <- snpgdsIBS(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 53
##     # of SNPs: 456
##     using 2 threads
## IBS:    the sum of all selected genotypes (0,1,2) = 37200
## Tue Apr 20 20:24:52 2021    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:24:52 2021    Done.
pop.idx.adult <- order(tab.adult.annot$pop_code)

#par(xpd=TRUE)
#heatmap(ibs.adult$ibs, col=terrain.colors(16), labCol=TRUE, 
#        RowSideColors=c("black","red","green","blue")[tab.adult.annot$pop_code])
#legend(0,-.05, legend=levels(tab.adult.annot$pop_code), pch="o", col=1:nlevels(tab.adult.annot$pop_code), 
#       text.col=1:nlevels(tab.adult.annot$pop_code), cex=.75, ncol=4)

#Perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.adult <- cmdscale(1 - ibs.adult$ibs, k = 2)
x.adult <- loc.adult[, 1]; y.adult <- loc.adult[, 2]

plot(x.adult, y.adult, col=group.adult, xlab = "", ylab = "",
    main = "Multidimensional Scaling Analysis (IBS)", cex=2, pch=16)
text(x.adult, y.adult+.01, labels=tab.adult.annot$sample.id, cex=0.7, font=2, col=as.integer(group.adult))
legend("bottom", legend=levels(group.adult), pch="o", text.col=1:nlevels(group.adult))

LARVAE, COMPARISON AMONG PCO2 TREATMENTS WITHIN POPULATIONS

Use gatk to hard filter variants for depth & quality

/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
##     java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 20:24:53.986 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:24:54 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:24:54.385 INFO  VariantFiltration - ------------------------------------------------------------
## 20:24:54.386 INFO  VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:24:54.386 INFO  VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:24:54.386 INFO  VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:24:54.386 INFO  VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:24:54.386 INFO  VariantFiltration - Start Date/Time: April 20, 2021 at 8:24:53 PM PDT
## 20:24:54.386 INFO  VariantFiltration - ------------------------------------------------------------
## 20:24:54.386 INFO  VariantFiltration - ------------------------------------------------------------
## 20:24:54.386 INFO  VariantFiltration - HTSJDK Version: 2.23.0
## 20:24:54.386 INFO  VariantFiltration - Picard Version: 2.23.3
## 20:24:54.386 INFO  VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:24:54.386 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:24:54.387 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:24:54.387 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:24:54.387 INFO  VariantFiltration - Deflater: IntelDeflater
## 20:24:54.387 INFO  VariantFiltration - Inflater: IntelInflater
## 20:24:54.387 INFO  VariantFiltration - GCS max retries/reopens: 20
## 20:24:54.387 INFO  VariantFiltration - Requester pays: disabled
## 20:24:54.387 INFO  VariantFiltration - Initializing engine
## 20:24:54.499 INFO  FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz
## 20:24:54.577 INFO  VariantFiltration - Done initializing engine
## 20:24:54.640 INFO  ProgressMeter - Starting traversal
## 20:24:54.640 INFO  ProgressMeter -        Current Locus  Elapsed Minutes    Variants Processed  Variants/Minute
## 20:25:04.650 INFO  ProgressMeter - Contig64832_70524:46356988              0.2                485000        2907092.9
## 20:25:05.885 INFO  ProgressMeter - Contig98527_104152:30619157              0.2                555097        2961833.7
## 20:25:05.885 INFO  ProgressMeter - Traversal complete. Processed 555097 total variants in 0.2 minutes.
## 20:25:05.947 INFO  VariantFiltration - Shutting down engine
## [April 20, 2021 at 8:25:05 PM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.20 minutes.
## Runtime.totalMemory()=387973120

Select only SNPs that pass filtering

/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
##     java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## 20:25:07.808 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:25:08 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:25:08.075 INFO  SelectVariants - ------------------------------------------------------------
## 20:25:08.075 INFO  SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:25:08.075 INFO  SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:25:08.075 INFO  SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:25:08.075 INFO  SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:25:08.075 INFO  SelectVariants - Start Date/Time: April 20, 2021 at 8:25:07 PM PDT
## 20:25:08.075 INFO  SelectVariants - ------------------------------------------------------------
## 20:25:08.075 INFO  SelectVariants - ------------------------------------------------------------
## 20:25:08.076 INFO  SelectVariants - HTSJDK Version: 2.23.0
## 20:25:08.076 INFO  SelectVariants - Picard Version: 2.23.3
## 20:25:08.076 INFO  SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:25:08.076 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:25:08.076 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:25:08.076 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:25:08.076 INFO  SelectVariants - Deflater: IntelDeflater
## 20:25:08.076 INFO  SelectVariants - Inflater: IntelInflater
## 20:25:08.076 INFO  SelectVariants - GCS max retries/reopens: 20
## 20:25:08.076 INFO  SelectVariants - Requester pays: disabled
## 20:25:08.077 INFO  SelectVariants - Initializing engine
## 20:25:08.217 INFO  FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz
## 20:25:08.290 INFO  SelectVariants - Done initializing engine
## 20:25:08.324 INFO  ProgressMeter - Starting traversal
## 20:25:08.325 INFO  ProgressMeter -        Current Locus  Elapsed Minutes    Variants Processed  Variants/Minute
## 20:25:18.413 INFO  ProgressMeter - Contig11179_21262:51558538              0.2                 90000         535342.5
## 20:25:28.423 INFO  ProgressMeter - Contig28136_34745:55217958              0.3                220000         656781.8
## 20:25:38.432 INFO  ProgressMeter - Contig59084_64831:51489531              0.5                370000         737370.0
## 20:25:43.054 INFO  ProgressMeter - Contig98527_104152:27381330              0.6                439968         760116.3
## 20:25:43.054 INFO  ProgressMeter - Traversal complete. Processed 439968 total variants in 0.6 minutes.
## 20:25:43.102 INFO  SelectVariants - Shutting down engine
## [April 20, 2021 at 8:25:43 PM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.59 minutes.
## Runtime.totalMemory()=322961408

Use SNPRelate to filter vcf files. Remove:

  • Select biallelic loci only
  • Loci with minor allele frequency < 0.3 (?)
  • Select loci that are present across all samples (missing.rate=0)
#snpgdsClose(genofile.larvae)
vcf.fn.larvae <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.larvae, "../qc-processing/gatk/genotypes-larvae.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 76
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz" ...
##  import 128222 variants.
## + genotype   { Bit2 76x128222, 2.3M } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
##     open the file '../qc-processing/gatk/genotypes-larvae.gds' (2.9M)
##     # of fragments: 59
##     save to '../qc-processing/gatk/genotypes-larvae.gds.tmp'
##     rename '../qc-processing/gatk/genotypes-larvae.gds.tmp' (2.9M, reduced: 468B)
##     # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-larvae.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-larvae.gds 
## The total number of samples: 76 
## The total number of SNPs: 128222 
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

Open gds and check out genofile contents

(genofile.larvae <- snpgdsOpen("../qc-processing/gatk/genotypes-larvae.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-larvae.gds (2.9M)
## +    [  ] *
## |--+ sample.id   { Str8 76 LZMA_ra(63.8%), 201B }
## |--+ snp.id   { Int32 128222 LZMA_ra(6.56%), 32.8K }
## |--+ snp.rs.id   { Str8 128222 LZMA_ra(0.13%), 173B }
## |--+ snp.position   { Int32 128222 LZMA_ra(42.0%), 210.2K }
## |--+ snp.chromosome   { Str8 128222 LZMA_ra(0.04%), 857B }
## |--+ snp.allele   { Str8 128222 LZMA_ra(12.0%), 60.2K }
## |--+ genotype   { Bit2 76x128222, 2.3M } *
## \--+ snp.annot   [  ]
##    |--+ qual   { Float32 128222 LZMA_ra(65.7%), 328.9K }
##    \--+ filter   { Str8 128222 LZMA_ra(0.04%), 253B }

Prune/Clean SNPs

  • Remove those in linkage-disequilibrium
  • Remove those with Minor Allele Frequency <1%
  • Remove those that are missing from any sample
snpset.larvae <- snpgdsLDpruning(genofile.larvae, maf=.10, missing.rate=0.10, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 128,154 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0.1)
##     # of samples: 76
##     # of SNPs: 68
##     using 1 thread
##     sliding window: 500,000 basepairs, Inf SNPs
##     |LD| threshold: 0.2
##     method: composite
## Chromosome Contig0_5313: 0.09%, 14/15,947
## Chromosome Contig104153_109739: 0.06%, 1/1,625
## Chromosome Contig11179_21262: 0.02%, 3/12,305
## Chromosome Contig116188_123130: 0.08%, 1/1,207
## Chromosome Contig129983_136896: 0.08%, 1/1,222
## Chromosome Contig136897_143777: 0.15%, 2/1,321
## Chromosome Contig143780_150710: 0.17%, 2/1,179
## Chromosome Contig181898_197428: 0.09%, 1/1,122
## Chromosome Contig21263_28135: 0.04%, 5/13,677
## Chromosome Contig28136_34745: 0.03%, 4/12,701
## Chromosome Contig34748_41067: 0.06%, 6/10,383
## Chromosome Contig398123_696856: 0.10%, 1/992
## Chromosome Contig41068_47193: 0.09%, 7/7,913
## Chromosome Contig47194_53180: 0.04%, 3/6,968
## Chromosome Contig5314_11178: 0.03%, 1/3,691
## Chromosome Contig53181_59083: 0.05%, 3/5,690
## Chromosome Contig59084_64831: 0.09%, 4/4,432
## Chromosome Contig81766_87352: 0.11%, 3/2,843
## Chromosome Contig87353_92929: 0.05%, 1/1,952
## Chromosome Contig92931_98526: 0.05%, 1/1,900
## Chromosome Contig98527_104152: 0.05%, 1/1,979
## 65 markers are selected in total.
snpset.larvae.id <- unlist(unname(snpset.larvae))

For LARVAL analysis, ~360 SNPs remain after all filtering steps

Larval PCA

# PCA 
pca.larvae <- snpgdsPCA(genofile.larvae, snp.id=snpset.larvae.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 76
##     # of SNPs: 65
##     using 2 threads
##     # of principal components: 32
## PCA:    the sum of all selected genotypes (0,1,2) = 7538
## CPU capabilities: Double-Precision SSE2
## Tue Apr 20 20:25:49 2021    (internal increment: 13312)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:25:49 2021    Begin (eigenvalues and eigenvectors)
## Tue Apr 20 20:25:49 2021    Done.
pc.percent.larvae <- pca.larvae$varprop*100
head(round(pc.percent.larvae, 2))
## [1] 10.00  7.72  5.61  5.10  4.57  4.19
tab.larvae <- data.frame(sample.id = pca.larvae$sample.id,
    EV1 = pca.larvae$eigenvect[,1],    # the first eigenvector
    EV2 = pca.larvae$eigenvect[,2],    # the second eigenvector
    EV3 = pca.larvae$eigenvect[,3],    # the third eigenvector
    EV4 = pca.larvae$eigenvect[,4],    # the fourth eigenvector
    EV5 = pca.larvae$eigenvect[,5],    # the fourth eigenvector
    stringsAsFactors = FALSE)
tab.larvae.annot <- left_join(tab.larvae, sample.info.larvae, by=c("sample.id"="sample")) %>% droplevels()
group.larvae <- as.factor(tab.larvae.annot$pop_code)

lbls <- paste("PC", 1:5, "\n", format(pc.percent.larvae[1:5], digits=2), "%", sep="")
pairs(pca.larvae$eigenvect[,1:5], col=as.color(tab.larvae.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")

pairs(pca.larvae$eigenvect[,1:5], col=as.color(tab.larvae.annot$pop_code), labels=lbls, main="colors = pop & pCO2")

Interactive PCA biplots

# PC1 x PC2
# uncomment out the ggplotly lines to interact
#ggplotly( 
    ggplot(tab.larvae.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC2\nLarvae Genetic Structure (pooled batches)")#)

#PC1 x PC3
# uncomment out the ggplotly lines to interact
ggplotly( 
    ggplot(tab.larvae.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC3\nLarvae Genetic Structure (pooled batches)"))
# PC1 x PC4
ggplotly( 
    ggplot(tab.larvae.annot, aes(x=EV4, y=EV2, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC4\nLarvae Gene Expression (pooled batches)"))
# PC1 x PC5
ggplotly( 
    ggplot(tab.larvae.annot, aes(x=EV5, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC5\nLarvae Gene Expression (pooled batches)"))

Larval cluster analysis

# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances, 
# and determine the groups by a permutation score. 
ibs.hc.larvae <- snpgdsHCluster(snpgdsIBS(genofile.larvae, num.thread=2,  snp.id=snpset.larvae.id, autosome.only=FALSE, 
                                         sample.id=setdiff(tab.larvae.annot$sample.id, "044")))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 75
##     # of SNPs: 65
##     using 2 threads
## IBS:    the sum of all selected genotypes (0,1,2) = 7516
## Tue Apr 20 20:25:50 2021    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:25:50 2021    Done.
rv.larvae <- snpgdsCutTree(ibs.hc.larvae)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.larvae$dendrogram, leaflab="none", main="HapMap Phase II")

# Determine groups of individuals by population information
group.larvae <- as.factor(tab.larvae.annot$pop_code)
rv2.larvae <- snpgdsCutTree(ibs.hc.larvae, samp.group=as.factor(subset(tab.larvae.annot, sample.id!="044")$pop_code))
## Create 8 groups.
plot(rv2.larvae$dendrogram, leaflab="none", main="HapMap Phase II")
legend("top", legend=levels(group.larvae), col=1:nlevels(group.larvae), pch=19, ncol=4, cex=0.6, pt.cex=1)

## Larval Allele frequencies

AF.larvae.FB.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.larvae.FB.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)
AF.larvae.DB.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.larvae.DB.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)
AF.larvae.OB1.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.larvae.OB1.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)
AF.larvae.OB2.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Oyster Bay C2" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.larvae.OB2.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Oyster Bay C2" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)

AF.larvae <- rbind(do.call(cbind.data.frame, AF.larvae.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.larvae.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae"), pCO2=as.factor("ambient")), 
do.call(cbind.data.frame, AF.larvae.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.larvae.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.OB1.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.larvae.OB1.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.OB2.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB2"), stage=as.factor("larvae"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.larvae.OB2.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB2"), stage=as.factor("larvae"), pCO2=as.factor("ambient")))

ggplot(AF.larvae, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population, ncol=4) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) + 
  stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
  ggtitle(paste("SNP Allele Frequency, larvaes by population & pCO2 treatment\n", "(No. loci = ", length(snpset.larvae.id), ")", sep="")) +
    geom_text(data=AF.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2))) + theme(legend.position = "none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.

Larval Fst Calculations

# within FB (comparing pCO2 exposure): 
Fst.larvae.FB <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay")$sample.id, 
               population=as.factor(subset(tab.larvae.annot, population=="Fidalgo Bay")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 7 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 17
##     # of SNPs: 58
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (7), High (10)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.larvae.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High:  0.00852"
hist(Fst.larvae.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, Larvae", col="dodgerblue3", xlim = c(-.15, 0.8))

# within DB (comparing pCO2 exposure): 
Fst.larvae.DB <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id, 
                        sample.id=subset(tab.larvae.annot, population=="Dabob Bay")$sample.id, 
               population=as.factor(subset(tab.larvae.annot, population=="Dabob Bay")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 18 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 12
##     # of SNPs: 47
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (5), High (7)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.larvae.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High:  0.01232"
hist(Fst.larvae.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, Larvae", col="darkseagreen3", xlim = c(-.2, .8))

# within OB1 (comparing pCO2 exposure): 
Fst.larvae.OB1 <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id, 
                         sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1")$sample.id, 
               population=as.factor(subset(tab.larvae.annot, population=="Oyster Bay C1")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 2 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 32
##     # of SNPs: 63
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (16), High (16)
paste("Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: ", round(Fst.larvae.OB1$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High:  0.00041"
hist(Fst.larvae.OB1$FstSNP, breaks = 100, main="Fst distribution\nOyster Bay C1 Ambient vs. High, Larvae", col="salmon2", xlim = c(-.2, 0.8))

Fst.larvae <- rbind(do.call(cbind.data.frame, Fst.larvae.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae")), 
do.call(cbind.data.frame, Fst.larvae.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae")),
do.call(cbind.data.frame, Fst.larvae.OB1[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae"))) 

ggplot(Fst.larvae, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) + 
  stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
  ggtitle(paste("SNP Fst, Larvae by pCO2 treatment\n", "(No. loci = ", length(snpset.larvae.id), ")", sep="")) +
    geom_text(data=Fst.larvae %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=.78, label=round(mean, 3))) +
  theme_minimal()
## Warning: `fun.y` is deprecated. Use `fun` instead.

Larval Identify-by-state

ibs.larvae <- snpgdsIBS(genofile.larvae, num.thread=2, snp.id=snpset.larvae.id, autosome.only=FALSE, sample.id=setdiff(tab.larvae.annot$sample.id, "044"))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 75
##     # of SNPs: 65
##     using 2 threads
## IBS:    the sum of all selected genotypes (0,1,2) = 7516
## Tue Apr 20 20:25:51 2021    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:25:51 2021    Done.
pop.idx.larvae <- order(tab.larvae.annot$pop_code)

#perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.larvae <- cmdscale(1 - ibs.larvae$ibs, k = 2)
x.larvae <- loc.larvae[, 1]; y.larvae <- loc.larvae[, 2]
a <- as.factor(subset(tab.larvae.annot, sample.id!="044")$population)
b <- as.numeric(as.factor(subset(tab.larvae.annot, sample.id!="044")$pCO2.parent))

par(xpd=TRUE)
plot(x.larvae, y.larvae, col=group.larvae, xlab = "", ylab = "",
    main = "Multidimensional Scaling Analysis (IBS)", cex=2, pch=16)
#text(x.larvae, y.larvae+.01, labels=tab.larvae.annot$sample.id, cex=0.7, font=2, col=as.integer(group.larvae))
legend("bottomright", inset=c(-.065,0), legend=levels(group.larvae), cex=.6, pch="o", text.col=1:nlevels(group.larvae))

# Option to plot color coded by pop, symbol by pCO2  
#plot(x.larvae, y.larvae, col=a, pch=b, xlab = "", ylab = "",
#    main = "Multidimensional Scaling Analysis (IBS)", cex=1)

JUVENILES, COMPARISON AMONG PCO2 TREATMENTS WITHIN POPULATIONS

Use gatk to hard filter variants for depth & quality

/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
##     java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 20:25:53.658 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:25:53 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:25:53.829 INFO  VariantFiltration - ------------------------------------------------------------
## 20:25:53.829 INFO  VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:25:53.829 INFO  VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:25:53.829 INFO  VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:25:53.829 INFO  VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:25:53.829 INFO  VariantFiltration - Start Date/Time: April 20, 2021 at 8:25:53 PM PDT
## 20:25:53.830 INFO  VariantFiltration - ------------------------------------------------------------
## 20:25:53.830 INFO  VariantFiltration - ------------------------------------------------------------
## 20:25:53.830 INFO  VariantFiltration - HTSJDK Version: 2.23.0
## 20:25:53.830 INFO  VariantFiltration - Picard Version: 2.23.3
## 20:25:53.830 INFO  VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:25:53.830 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:25:53.830 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:25:53.830 INFO  VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:25:53.830 INFO  VariantFiltration - Deflater: IntelDeflater
## 20:25:53.830 INFO  VariantFiltration - Inflater: IntelInflater
## 20:25:53.830 INFO  VariantFiltration - GCS max retries/reopens: 20
## 20:25:53.831 INFO  VariantFiltration - Requester pays: disabled
## 20:25:53.831 INFO  VariantFiltration - Initializing engine
## 20:25:53.930 INFO  FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz
## 20:25:53.998 INFO  VariantFiltration - Done initializing engine
## 20:25:54.054 INFO  ProgressMeter - Starting traversal
## 20:25:54.055 INFO  ProgressMeter -        Current Locus  Elapsed Minutes    Variants Processed  Variants/Minute
## 20:25:59.122 INFO  ProgressMeter - Contig98527_104152:26467437              0.1                280822        3325961.3
## 20:25:59.122 INFO  ProgressMeter - Traversal complete. Processed 280822 total variants in 0.1 minutes.
## 20:25:59.185 INFO  VariantFiltration - Shutting down engine
## [April 20, 2021 at 8:25:59 PM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.09 minutes.
## Runtime.totalMemory()=322961408

Select only SNPs that pass filtering

/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
##     java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## 20:26:01.519 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:26:01 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:26:01.770 INFO  SelectVariants - ------------------------------------------------------------
## 20:26:01.770 INFO  SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:26:01.770 INFO  SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:26:01.770 INFO  SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:26:01.770 INFO  SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:26:01.770 INFO  SelectVariants - Start Date/Time: April 20, 2021 at 8:26:01 PM PDT
## 20:26:01.770 INFO  SelectVariants - ------------------------------------------------------------
## 20:26:01.770 INFO  SelectVariants - ------------------------------------------------------------
## 20:26:01.771 INFO  SelectVariants - HTSJDK Version: 2.23.0
## 20:26:01.771 INFO  SelectVariants - Picard Version: 2.23.3
## 20:26:01.771 INFO  SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:26:01.771 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:26:01.771 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:26:01.771 INFO  SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:26:01.772 INFO  SelectVariants - Deflater: IntelDeflater
## 20:26:01.772 INFO  SelectVariants - Inflater: IntelInflater
## 20:26:01.772 INFO  SelectVariants - GCS max retries/reopens: 20
## 20:26:01.772 INFO  SelectVariants - Requester pays: disabled
## 20:26:01.772 INFO  SelectVariants - Initializing engine
## 20:26:01.918 INFO  FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz
## 20:26:01.992 INFO  SelectVariants - Done initializing engine
## 20:26:02.032 INFO  ProgressMeter - Starting traversal
## 20:26:02.032 INFO  ProgressMeter -        Current Locus  Elapsed Minutes    Variants Processed  Variants/Minute
## 20:26:09.533 INFO  ProgressMeter - Contig98527_104152:24952552              0.1                222831        1782648.0
## 20:26:09.533 INFO  ProgressMeter - Traversal complete. Processed 222831 total variants in 0.1 minutes.
## 20:26:09.580 INFO  SelectVariants - Shutting down engine
## [April 20, 2021 at 8:26:09 PM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.14 minutes.
## Runtime.totalMemory()=322961408

Use SNPRelate to filter vcf files. Remove:

  • Select biallelic loci only
  • Loci with minor allele frequency < 0.3 (?)
  • Select loci that are present across all samples (missing.rate=0)
#snpgdsClose(genofile.juvenile)
vcf.fn.juvenile <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.juvenile, "../qc-processing/gatk/genotypes-juvenile.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 16
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz" ...
##  import 50608 variants.
## + genotype   { Bit2 16x50608, 197.7K } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
##     open the file '../qc-processing/gatk/genotypes-juvenile.gds' (459.9K)
##     # of fragments: 49
##     save to '../qc-processing/gatk/genotypes-juvenile.gds.tmp'
##     rename '../qc-processing/gatk/genotypes-juvenile.gds.tmp' (459.5K, reduced: 348B)
##     # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-juvenile.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-juvenile.gds 
## The total number of samples: 16 
## The total number of SNPs: 50608 
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

Open gds and check out genofile contents

(genofile.juvenile <- snpgdsOpen("../qc-processing/gatk/genotypes-juvenile.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-juvenile.gds (459.5K)
## +    [  ] *
## |--+ sample.id   { Str8 16 LZMA_ra(178.1%), 121B }
## |--+ snp.id   { Int32 50608 LZMA_ra(7.08%), 14.0K }
## |--+ snp.rs.id   { Str8 50608 LZMA_ra(0.30%), 157B }
## |--+ snp.position   { Int32 50608 LZMA_ra(43.1%), 85.1K }
## |--+ snp.chromosome   { Str8 50608 LZMA_ra(0.07%), 649B }
## |--+ snp.allele   { Str8 50608 LZMA_ra(12.3%), 24.2K }
## |--+ genotype   { Bit2 16x50608, 197.7K } *
## \--+ snp.annot   [  ]
##    |--+ qual   { Float32 50608 LZMA_ra(68.5%), 135.4K }
##    \--+ filter   { Str8 50608 LZMA_ra(0.08%), 197B }

Prune/Clean SNPs

  • Remove those in linkage-disequilibrium
  • Remove those with Minor Allele Frequency <1%
  • Remove those that are missing from any sample
snpset.juvenile <- snpgdsLDpruning(genofile.juvenile, maf=.1, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 41,649 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0)
##     # of samples: 16
##     # of SNPs: 8,959
##     using 1 thread
##     sliding window: 500,000 basepairs, Inf SNPs
##     |LD| threshold: 0.2
##     method: composite
## Chromosome Contig0_5313: 4.90%, 281/5,731
## Chromosome Contig104153_109739: 5.33%, 40/750
## Chromosome Contig109741_116186: 5.22%, 35/670
## Chromosome Contig11179_21262: 4.22%, 190/4,498
## Chromosome Contig116188_123130: 6.21%, 36/580
## Chromosome Contig123131_129982: 5.21%, 22/422
## Chromosome Contig129983_136896: 6.16%, 33/536
## Chromosome Contig136897_143777: 5.40%, 36/667
## Chromosome Contig143780_150710: 6.58%, 32/486
## Chromosome Contig150711_166372: 6.15%, 28/455
## Chromosome Contig166374_181896: 6.36%, 35/550
## Chromosome Contig181898_197428: 4.96%, 21/423
## Chromosome Contig197431_213218: 4.96%, 28/564
## Chromosome Contig21263_28135: 4.55%, 227/4,994
## Chromosome Contig213221_398012: 6.67%, 34/510
## Chromosome Contig28136_34745: 4.36%, 208/4,772
## Chromosome Contig34748_41067: 5.00%, 191/3,817
## Chromosome Contig398123_696856: 5.88%, 28/476
## Chromosome Contig41068_47193: 4.97%, 159/3,201
## Chromosome Contig47194_53180: 5.18%, 146/2,819
## Chromosome Contig5314_11178: 4.95%, 70/1,414
## Chromosome Contig53181_59083: 5.17%, 131/2,534
## Chromosome Contig59084_64831: 5.86%, 105/1,792
## Chromosome Contig64832_70524: 5.38%, 89/1,654
## Chromosome Contig70525_76135: 5.71%, 76/1,331
## Chromosome Contig76136_81765: 5.87%, 78/1,329
## Chromosome Contig81766_87352: 5.99%, 72/1,203
## Chromosome Contig87353_92929: 6.24%, 54/866
## Chromosome Contig92931_98526: 5.94%, 47/791
## Chromosome Contig98527_104152: 6.99%, 54/773
## 2,586 markers are selected in total.
snpset.juvenile.id <- unlist(unname(snpset.juvenile))

For JUVENILE analysis, ~2,600 SNPs remain after all filtering steps

Juvenile PCA

# PCA 
pca.juvenile <- snpgdsPCA(genofile.juvenile, snp.id=snpset.juvenile.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 16
##     # of SNPs: 2,586
##     using 2 threads
##     # of principal components: 32
## PCA:    the sum of all selected genotypes (0,1,2) = 57125
## CPU capabilities: Double-Precision SSE2
## Tue Apr 20 20:26:11 2021    (internal increment: 63232)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:26:11 2021    Begin (eigenvalues and eigenvectors)
## Tue Apr 20 20:26:11 2021    Done.
pc.percent.juvenile <- pca.juvenile$varprop*100
head(round(pc.percent.juvenile, 2))
## [1] 19.25 12.89  8.97  7.25  7.08  6.33
tab.juvenile <- data.frame(sample.id = pca.juvenile$sample.id,
    EV1 = pca.juvenile$eigenvect[,1],    # the first eigenvector
    EV2 = pca.juvenile$eigenvect[,2],    # the second eigenvector
    EV3 = pca.juvenile$eigenvect[,3],    # the third eigenvector
    EV4 = pca.juvenile$eigenvect[,4],    # the fourth eigenvector
    EV5 = pca.juvenile$eigenvect[,5],    # the fourth eigenvector
    stringsAsFactors = FALSE)
tab.juvenile.annot <- left_join(tab.juvenile, sample.info.juvenile, by=c("sample.id"="sample")) %>% droplevels()
group.juvenile <- as.factor(tab.juvenile.annot$pop_code)

lbls <- paste("PC", 1:5, "\n", format(pc.percent.juvenile[1:5], digits=2), "%", sep="")
pairs(pca.juvenile$eigenvect[,1:5], col=as.color(tab.juvenile.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")

pairs(pca.juvenile$eigenvect[,1:5], col=as.color(tab.juvenile.annot$pop_code), labels=lbls, main="colors = pop & pCO2")

Interactive PCAs

# PC1 x PC2
# uncomment out the ggplotly lines to interact
ggplotly( 
    ggplot(tab.juvenile.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC2\njuvenile Genetic Structure (pooled batches)"))
#PC1 x PC3
# uncomment out the ggplotly lines to interact
ggplotly( 
    ggplot(tab.juvenile.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC3\njuvenile Genetic Structure (pooled batches)"))
#PC2 x PC3
# uncomment out the ggplotly lines to interact
ggplotly( 
    ggplot(tab.juvenile.annot, aes(x=EV3, y=EV2, col=population, shape=pCO2.parent)) + 
         geom_point(size=4) + theme_minimal() + ggtitle("PC2 x PC3\njuvenile Genetic Structure (pooled batches)"))

Juvenile cluster analysis

# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances, 
# and determine the groups by a permutation score. 
ibs.hc.juvenile <- snpgdsHCluster(snpgdsIBS(genofile.juvenile, num.thread=2,  snp.id=snpset.juvenile.id, autosome.only=FALSE, 
                                         sample.id=setdiff(tab.juvenile.annot$sample.id, "044")))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 16
##     # of SNPs: 2,586
##     using 2 threads
## IBS:    the sum of all selected genotypes (0,1,2) = 57125
## Tue Apr 20 20:26:11 2021    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:26:11 2021    Done.
rv.juvenile <- snpgdsCutTree(ibs.hc.juvenile)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.juvenile$dendrogram, leaflab="none", main="HapMap Phase II")

# Determine groups of individuals by population information
rv2.juvenile <- snpgdsCutTree(ibs.hc.juvenile, samp.group=as.factor(subset(tab.juvenile.annot, sample.id!="044")$pop_code))
## Create 4 groups.
plot(rv2.juvenile$dendrogram, leaflab="none", main="HapMap Phase II")
legend("top", legend=levels(group.juvenile), col=1:nlevels(group.juvenile), pch=19, ncol=4, cex=0.6, pt.cex=1)

## Juvenile Allele frequencies

AF.juvenile.FB.high <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id, 
                        sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.juvenile.FB.amb <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id, 
                        sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)
AF.juvenile.DB.high <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id, 
                        sample.id=subset(tab.juvenile.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
                        with.id = TRUE)
AF.juvenile.DB.amb <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id, 
                        sample.id=subset(tab.juvenile.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
                        with.id = TRUE)

AF.juveniles <- rbind(do.call(cbind.data.frame, AF.juvenile.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.juvenile.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile"), pCO2=as.factor("ambient")), 
do.call(cbind.data.frame, AF.juvenile.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile"), pCO2=as.factor("high")), 
do.call(cbind.data.frame, AF.juvenile.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile"), pCO2=as.factor("ambient")))

ggplot(AF.juveniles, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) + 
  stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
  ggtitle(paste("SNP Allele Frequency, juveniles by population & pCO2 treatment\n", "(No. loci = ", length(snpset.juvenile.id), ")", sep="")) +
    geom_text(data=AF.juveniles %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2)))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.

Juvenile Fst Calculations

# within FB (comparing pCO2 exposure): 
Fst.juvenile.FB <- snpgdsFst(genofile.juvenile, snp.id=snpset.juvenile.id, 
                        sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay")$sample.id, 
               population=as.factor(subset(tab.juvenile.annot, population=="Fidalgo Bay")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 251 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 8
##     # of SNPs: 2,335
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (4), High (4)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.juvenile.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High:  0.03405"
hist(Fst.juvenile.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, juvenile", col="dodgerblue3", xlim = c(-.15, 0.8))

# within DB (comparing pCO2 exposure): 
Fst.juvenile.DB <- snpgdsFst(genofile.juvenile, snp.id=snpset.juvenile.id, 
                        sample.id=subset(tab.juvenile.annot, population=="Dabob Bay")$sample.id, 
               population=as.factor(subset(tab.juvenile.annot, population=="Dabob Bay")$pCO2.parent),
    method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 276 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 8
##     # of SNPs: 2,310
## Method: Weir & Cockerham, 1984
## # of Populations: 2
##     Ambient (4), High (4)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.juvenile.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High:  0.1068"
hist(Fst.juvenile.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, juvenile", col="darkseagreen3", xlim = c(-.2, .8))

Fst.juvenile <- rbind(do.call(cbind.data.frame, Fst.juvenile.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile")), 
do.call(cbind.data.frame, Fst.juvenile.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile"))) 

ggplot(Fst.juvenile, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) + 
  stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
  ggtitle(paste("SNP Fst, juvenile by pCO2 treatment\n", "(No. loci = ", length(snpset.juvenile.id), ")", sep="")) +
    geom_text(data=Fst.juvenile %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=1.1, label=round(mean, 3))) +
  theme_minimal()
## Warning: `fun.y` is deprecated. Use `fun` instead.

Juvenile Identify-by-state

ibs.juvenile <- snpgdsIBS(genofile.juvenile, num.thread=2, snp.id=snpset.juvenile.id, autosome.only=FALSE, sample.id=setdiff(tab.juvenile.annot$sample.id, "044"))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
##     # of samples: 16
##     # of SNPs: 2,586
##     using 2 threads
## IBS:    the sum of all selected genotypes (0,1,2) = 57125
## Tue Apr 20 20:26:12 2021    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:26:12 2021    Done.
pop.idx.juvenile <- order(tab.juvenile.annot$pop_code)

#perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.juvenile <- cmdscale(1 - ibs.juvenile$ibs, k = 2)
x.juvenile <- loc.juvenile[, 1]; y.juvenile <- loc.juvenile[, 2]
a <- as.factor(subset(tab.juvenile.annot, sample.id!="044")$population)
b <- as.numeric(as.factor(subset(tab.juvenile.annot, sample.id!="044")$pCO2.parent))

plot(x.juvenile, y.juvenile, col=a, pch=b, xlab = "", ylab = "",
    main = "Multidimensional Scaling Analysis (IBS)", cex=1)

#text(x.juvenile, y.juvenile+.012, labels=tab.juvenile.annot$sample.id, cex=0.7, font=2, col=as.integer(group.juvenile))
#legend("topleft", legend=levels(group.juvenile), pch="o", cex=0.8) #text.col=1:nlevels(group.juvenile), 

Merge all Allele Frequency and Fst calculations together and plot

Fst.all <- rbind(Fst.adults, Fst.larvae, Fst.juvenile)
Fst.means <- Fst.all %>% select(-snp.id) %>% group_by(population, stage) %>% summarize(mean=mean(FstSNP), sd=sd(FstSNP), n=n())
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
AF.all <- rbind(AF.adults, AF.larvae, AF.juveniles)
AF.means <- AF.all %>% select(-snp.id) %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(AlleleFreq), sd=sd(AlleleFreq), n=n())
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
ggplot(AF.all, aes(x=stage, y=AlleleFreq, fill=pCO2, alpha=stage)) + geom_violin(trim = T) + theme_minimal()  +   
  ggtitle("Allele Frequency\nBy population, stage, &  adult pCO2 treatment") + xlab("") + ylab("Allele Frequency")  + facet_wrap(~population) +scale_alpha_discrete(range = c(0.85, 0.15)) #+
## Warning: Using alpha for a discrete variable is not advised.

#  g0eom_point(data=AF.all %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(AlleleFreq)), 
#             aes(x=stage:pCO2, y=mean, label=round(mean, 3)), cex=2.25, inherit.aes=T, position = position_dodge(width=1))

ggplot(Fst.all, aes(x=stage, y=FstSNP, fill=stage)) + geom_violin(trim = T, alpha=0.4) + theme_minimal() + facet_wrap(~ population) +   
    stat_summary(fun.y=mean, geom="point", aes(group=stage), shape=8, size=2, color="black", fill="black") +
  ggtitle("Per-SNP Fst among pCO2 treatments\nBy population and stage") + xlab("") + ylab("Per-Locus Fst") +
  geom_text(data=Fst.means, aes(group=stage, y=-.33, label=round(mean, 3)), cex=3.25)
## Warning: `fun.y` is deprecated. Use `fun` instead.

ggplot(Fst.all %>% filter(population==c("FB","DB")), aes(x=FstSNP, fill=stage)) + geom_density(alpha=0.4) + theme_minimal()  +   
  ggtitle("Per-SNP Fst among adult pCO2 treatments (high vs. ambient)\nBy population & stage") + xlab("") + ylab("Per-Locus Fst")  + facet_wrap(~population)

# More analysis via vcfR - compare allelic richness and heterozygosity by population, stage and parental pCO2 exposure

First filter the vcf files for each stage (adult, larvae, adult) to remove loci with tons if missing data. Do this via vcftools.

I used this tutorial as an example. - call vcftools - feed it a vcf file after the –vcf flag - Set –max-missing 0.90 to keep only loci with 10% missing genotypes across all individuals - The –recode flag tells the program to write a new vcf file with the filters - The –recode-INFO-all keeps all the INFO flags from the old vcf file in the new one. - Lastly, –out designates the name of the output

# adults 
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR"
## 
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
## 
## Parameters as interpreted:
##  --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
##  --recode-INFO-all
##  --max-missing 0.85
##  --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR
##  --recode
## 
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 53 out of 53 Individuals
## Outputting VCF file...
## After filtering, kept 45063 out of a possible 179693 Sites
## Run Time = 8.00 seconds
# larvae 
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR"
## 
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
## 
## Parameters as interpreted:
##  --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
##  --recode-INFO-all
##  --max-missing 0.85
##  --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR
##  --recode
## 
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 76 out of 76 Individuals
## Outputting VCF file...
## After filtering, kept 6029 out of a possible 131140 Sites
## Run Time = 4.00 seconds
# juveniles 
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR"
## 
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
## 
## Parameters as interpreted:
##  --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
##  --recode-INFO-all
##  --max-missing 0.85
##  --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR
##  --recode
## 
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 16 out of 16 Individuals
## Outputting VCF file...
## After filtering, kept 33681 out of a possible 51818 Sites
## Run Time = 2.00 seconds

Adults

require(vcfR)
# Load the data
vcf.adult <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR.recode.vcf", verbose = FALSE )

# Confirm order of samples in vcf matches order of samples in the "sample.info" dataframe 
colnames(extract.gt(vcf.adult)) == sample.info.adult$sample #yes, all are TRUE
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups 
myDiff.adult <- genetic_diff(vcf.adult, pops = sample.info.adult$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, adults ", nrow(myDiff.adult), sep="")
## [1] "No. SNPs included in this vcfR analysis, adults 45063"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.adult[,-1:-2], na.rm = TRUE), digits = 3))
x
Hs_Dabob Bay-Ambient 0.128
Hs_Dabob Bay-High 0.129
Hs_Fidalgo Bay-Ambient 0.123
Hs_Fidalgo Bay-High 0.121
Hs_Oyster Bay C1-Ambient 0.130
Hs_Oyster Bay C1-High 0.123
Ht 0.141
n_Dabob Bay-Ambient 17.007
n_Dabob Bay-High 14.722
n_Fidalgo Bay-Ambient 16.442
n_Fidalgo Bay-High 16.319
n_Oyster Bay C1-Ambient 17.279
n_Oyster Bay C1-High 16.834
Gst 0.087
Htmax 0.853
Gstmax 0.857
Gprimest 0.109
# Violin plots of Heterozygosity 
dpf.adult <- melt(myDiff.adult[,c(3:8)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
  mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
  mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.adult, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults \nNo. SNPs =", nrow(myDiff.adult))) +  
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.adult %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3) + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

# Boxplots of allelic richness 
dpf.n.adult <- melt(myDiff.adult[,c(10:15)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
  mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
  mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.adult, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Adults \nNo. SNPs =", nrow(myDiff.adult))) +  
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.n.adult %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=18.5, label=round(mean, 2)), size=3) + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.

## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

Larvae

# Load the data
vcf.larvae <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR.recode.vcf", verbose = FALSE )

# Write script to order sample.info dataframe to have same order as vcf  
colnames(extract.gt(vcf.larvae)) == sample.info.larvae[match(colnames(extract.gt(vcf.larvae)), sample.info.larvae$sample),]$sample #yes, all are TRUE
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups 
myDiff.larvae <- genetic_diff(vcf.larvae, pops = sample.info.larvae[match(colnames(extract.gt(vcf.larvae)), sample.info.larvae$sample),]$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, juveniles ", nrow(myDiff.larvae), sep="")
## [1] "No. SNPs included in this vcfR analysis, juveniles 6029"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.larvae[,-1:-2], na.rm = TRUE), digits = 3))
x
Hs_Dabob Bay-Ambient 0.081
Hs_Dabob Bay-High 0.082
Hs_Fidalgo Bay-Ambient 0.084
Hs_Fidalgo Bay-High 0.090
Hs_Oyster Bay C1-Ambient 0.068
Hs_Oyster Bay C1-High 0.069
Hs_Oyster Bay C2-Ambient 0.068
Hs_Oyster Bay C2-High 0.063
Ht 0.081
n_Dabob Bay-Ambient 9.268
n_Dabob Bay-High 9.608
n_Fidalgo Bay-Ambient 11.806
n_Fidalgo Bay-High 19.365
n_Oyster Bay C1-Ambient 27.795
n_Oyster Bay C1-High 28.190
n_Oyster Bay C2-Ambient 13.528
n_Oyster Bay C2-High 14.253
Gst 0.082
Htmax 0.862
Gstmax 0.916
Gprimest 0.092
# Violin plots of Heterozygosity
dpf.larvae <- melt(myDiff.larvae[,c(3:10)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
  mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
  mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.larvae, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Larvae \nNo. SNPs =", nrow(myDiff.larvae))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3)  + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

# Boxplots of allelic richness 
dpf.n.larvae <- melt(myDiff.larvae[,c(12:19)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
  mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
  mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.larvae, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Larvae \nNo. SNPs =", nrow(myDiff.larvae))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.n.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=3, label=round(mean, 2)), size=3)  + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.

## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

# 1-yr old “juveniles”

# Load the data
vcf.juvenile <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR.recode.vcf", verbose = FALSE )

# Confirm order of samples in vcf matches order of samples in the "sample.info" dataframe 
colnames(extract.gt(vcf.juvenile)) == sample.info.juvenile$sample #yes, all are TRUE
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups 
myDiff.juvenile <- genetic_diff(vcf.juvenile, pops = sample.info.juvenile$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, juveniles ", nrow(myDiff.juvenile), sep="")
## [1] "No. SNPs included in this vcfR analysis, juveniles 33681"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.juvenile[,-1:-2], na.rm = TRUE), digits = 3))
x
Hs_Dabob Bay-Ambient 0.189
Hs_Dabob Bay-High 0.197
Hs_Fidalgo Bay-Ambient 0.198
Hs_Fidalgo Bay-High 0.195
Ht 0.246
n_Dabob Bay-Ambient 7.637
n_Dabob Bay-High 7.838
n_Fidalgo Bay-Ambient 7.741
n_Fidalgo Bay-High 7.425
Gst 0.185
Htmax 0.797
Gstmax 0.762
Gprimest 0.251
# Violin plots of Heterozygosity
dpf.juvenile <- melt(myDiff.juvenile[,c(3:6)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
  mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
  mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.juvenile, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Juveniles \nNo. SNPs =", nrow(myDiff.juvenile))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.juvenile %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3)  + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

# Boxplots of allelic richness 
dpf.n.juvenile <- melt(myDiff.juvenile[,c(8:11)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
  mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
  mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.juvenile, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Juveniles \nNo. SNPs =", nrow(myDiff.juvenile))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.n.juvenile %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=8.2, label=round(mean, 2)), size=3)  + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.

## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

myDiff.AL <- inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS"))  #7,202 loci common among all
paste("No. SNPs shared among adults and larvae: ", nrow(myDiff.AL), sep="")
## [1] "No. SNPs shared among adults and larvae: 1397"
# Compare adults vs. larvae 
knitr::kable(round(colMeans(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS"))[,-1:-2], na.rm = TRUE), digits = 3))
x
Hs_Dabob Bay-Ambient.x 0.135
Hs_Dabob Bay-High.x 0.118
Hs_Fidalgo Bay-Ambient.x 0.111
Hs_Fidalgo Bay-High.x 0.117
Hs_Oyster Bay C1-Ambient.x 0.115
Hs_Oyster Bay C1-High.x 0.106
Ht.x 0.132
n_Dabob Bay-Ambient.x 17.311
n_Dabob Bay-High.x 14.454
n_Fidalgo Bay-Ambient.x 16.500
n_Fidalgo Bay-High.x 16.839
n_Oyster Bay C1-Ambient.x 17.270
n_Oyster Bay C1-High.x 17.092
Gst.x 0.094
Htmax.x 0.851
Gstmax.x 0.866
Gprimest.x 0.114
Hs_Dabob Bay-Ambient.y 0.111
Hs_Dabob Bay-High.y 0.111
Hs_Fidalgo Bay-Ambient.y 0.110
Hs_Fidalgo Bay-High.y 0.111
Hs_Oyster Bay C1-Ambient.y 0.096
Hs_Oyster Bay C1-High.y 0.095
Hs_Oyster Bay C2-Ambient 0.090
Hs_Oyster Bay C2-High 0.088
Ht.y 0.109
n_Dabob Bay-Ambient.y 9.244
n_Dabob Bay-High.y 9.618
n_Fidalgo Bay-Ambient.y 11.810
n_Fidalgo Bay-High.y 19.373
n_Oyster Bay C1-Ambient.y 27.993
n_Oyster Bay C1-High.y 28.306
n_Oyster Bay C2-Ambient 13.489
n_Oyster Bay C2-High 14.109
Gst.y 0.090
Htmax.y 0.865
Gstmax.y 0.888
Gprimest.y 0.103
# Heterozygosity 
dpf.AL <- 
  melt(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) [,c(3:8,20:27)], 
               varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
    mutate(variable=gsub("\\.x", ".adult", variable)) %>%
    mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
  mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
  mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## Warning: `mutate_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2794 rows [16765,
## 16766, 16767, 16768, 16769, 16770, 16771, 16772, 16773, 16774, 16775, 16776,
## 16777, 16778, 16779, 16780, 16781, 16782, 16783, 16784, ...].
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
ggplot(subset(dpf.AL, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) + 
  geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Larvae\nNo. shared SNPs =", nrow(myDiff.AL))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.AL %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
  scale_colour_manual(values=c("black", "dodgerblue3")) + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: Removed 2 rows containing missing values (geom_text).

# Allelic Richness 
dpf.n.AL <- melt(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) [,c(10:15,29:36)], 
               varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
    mutate(variable=gsub("\\.x", ".adult", variable)) %>%
    mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
  mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
  mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2794 rows [16765,
## 16766, 16767, 16768, 16769, 16770, 16771, 16772, 16773, 16774, 16775, 16776,
## 16777, 16778, 16779, 16780, 16781, 16782, 16783, 16784, ...].
ggplot(subset(dpf.n.AL, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) + 
  geom_violin(aes(fill=population, alpha=pCO2, col=stage), width=1) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1.3)) + ggtitle(paste("Allelic Richness, Adults & Larvae\nNo. shared SNPs =", nrow(myDiff.AL))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=dpf.n.AL %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=2, label=round(mean, 2)), size=3) +
  scale_colour_manual(values=c("black", "dodgerblue3")) + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.

## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: Removed 2 rows containing missing values (geom_text).

Compare adults and 1-yr old offspring

myDiff.AJ <- inner_join(myDiff.adult, myDiff.juvenile, by=c("CHROM", "POS"))
paste("No. SNPs shared among adults and juveniles: ", nrow(myDiff.AJ), sep="")
## [1] "No. SNPs shared among adults and juveniles: 4352"
# Compare adults vs. larvae 
knitr::kable(round(colMeans(myDiff.AJ[,-1:-2], na.rm = TRUE), digits = 3))
x
Hs_Dabob Bay-Ambient.x 0.203
Hs_Dabob Bay-High.x 0.198
Hs_Fidalgo Bay-Ambient.x 0.192
Hs_Fidalgo Bay-High.x 0.200
Hs_Oyster Bay C1-Ambient 0.192
Hs_Oyster Bay C1-High 0.184
Ht.x 0.219
n_Dabob Bay-Ambient.x 17.248
n_Dabob Bay-High.x 14.235
n_Fidalgo Bay-Ambient.x 16.196
n_Fidalgo Bay-High.x 16.919
n_Oyster Bay C1-Ambient 17.188
n_Oyster Bay C1-High 17.011
Gst.x 0.099
Htmax.x 0.864
Gstmax.x 0.779
Gprimest.x 0.135
Hs_Dabob Bay-Ambient.y 0.182
Hs_Dabob Bay-High.y 0.190
Hs_Fidalgo Bay-Ambient.y 0.200
Hs_Fidalgo Bay-High.y 0.188
Ht.y 0.243
n_Dabob Bay-Ambient.y 7.515
n_Dabob Bay-High.y 7.822
n_Fidalgo Bay-Ambient.y 7.842
n_Fidalgo Bay-High.y 7.394
Gst.y 0.193
Htmax.y 0.795
Gstmax.y 0.767
Gprimest.y 0.259
# Heterozygosity 
dpf.AJ <- melt(select(myDiff.AJ,contains("Hs")), 
               varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
    mutate(variable=gsub("\\.x", ".adult", variable)) %>%
    mutate(variable=gsub("\\.y", ".juvenile", variable)) %>%
  mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
  mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 8704 rows [17409,
## 17410, 17411, 17412, 17413, 17414, 17415, 17416, 17417, 17418, 17419, 17420,
## 17421, 17422, 17423, 17424, 17425, 17426, 17427, 17428, ...].
ggplot(filter(dpf.AJ, !grepl('Oyster', population))%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) + 
  geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Juveniles\nNo. shared SNPs =", nrow(myDiff.AJ))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=filter(dpf.AJ, !grepl('Oyster', population))%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
  scale_colour_manual(values=c("black", "darkred")) + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

# Allelic Richness 
dpf.n.AJ <- melt(select(myDiff.AJ,contains("n_")), 
               varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
    mutate(variable=gsub("\\.x", ".adult", variable)) %>%
    mutate(variable=gsub("\\.y", ".juvenile", variable)) %>%
  mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
  mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 8704 rows [17409,
## 17410, 17411, 17412, 17413, 17414, 17415, 17416, 17417, 17418, 17419, 17420,
## 17421, 17422, 17423, 17424, 17425, 17426, 17427, 17428, ...].
ggplot(filter(dpf.n.AJ, !grepl('Oyster', population))%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) + 
  geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Juveniles\nNo. shared SNPs =", nrow(myDiff.AJ))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=filter(dpf.n.AJ, !grepl('Oyster', population))%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
  scale_colour_manual(values=c("black", "darkred")) + 
  scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.

## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

Compare adults, larvae, and 1-yr old offspring

myDiff.ALJ <- inner_join(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")), myDiff.juvenile,  by=c("CHROM", "POS")) 
paste("No. SNPs shared among adults, larvae, and juveniles: ", nrow(myDiff.ALJ), sep="")
## [1] "No. SNPs shared among adults, larvae, and juveniles: 412"
# Compare adults vs. larvae 
knitr::kable(round(colMeans(myDiff.ALJ[,-1:-2], na.rm = TRUE), digits = 3))
x
Hs_Dabob Bay-Ambient.x 0.188
Hs_Dabob Bay-High.x 0.169
Hs_Fidalgo Bay-Ambient.x 0.142
Hs_Fidalgo Bay-High.x 0.150
Hs_Oyster Bay C1-Ambient.x 0.141
Hs_Oyster Bay C1-High.x 0.130
Ht.x 0.173
n_Dabob Bay-Ambient.x 17.398
n_Dabob Bay-High.x 14.296
n_Fidalgo Bay-Ambient.x 16.379
n_Fidalgo Bay-High.x 16.995
n_Oyster Bay C1-Ambient.x 17.194
n_Oyster Bay C1-High.x 17.121
Gst.x 0.102
Htmax.x 0.857
Gstmax.x 0.825
Gprimest.x 0.130
Hs_Dabob Bay-Ambient.y 0.150
Hs_Dabob Bay-High.y 0.154
Hs_Fidalgo Bay-Ambient.y 0.151
Hs_Fidalgo Bay-High.y 0.166
Hs_Oyster Bay C1-Ambient.y 0.117
Hs_Oyster Bay C1-High.y 0.115
Hs_Oyster Bay C2-Ambient 0.114
Hs_Oyster Bay C2-High 0.110
Ht.y 0.143
n_Dabob Bay-Ambient.y 9.194
n_Dabob Bay-High.y 9.476
n_Fidalgo Bay-Ambient.y 12.010
n_Fidalgo Bay-High.y 19.427
n_Oyster Bay C1-Ambient.y 27.607
n_Oyster Bay C1-High.y 28.364
n_Oyster Bay C2-Ambient 13.495
n_Oyster Bay C2-High 14.689
Gst.y 0.097
Htmax.y 0.870
Gstmax.y 0.853
Gprimest.y 0.115
Hs_Dabob Bay-Ambient 0.158
Hs_Dabob Bay-High 0.192
Hs_Fidalgo Bay-Ambient 0.170
Hs_Fidalgo Bay-High 0.150
Ht 0.214
n_Dabob Bay-Ambient 7.379
n_Dabob Bay-High 7.709
n_Fidalgo Bay-Ambient 7.854
n_Fidalgo Bay-High 7.345
Gst 0.193
Htmax 0.789
Gstmax 0.793
Gprimest 0.249
# Heterozygosity 
dpf.ALJ <- melt(select(myDiff.ALJ,contains("Hs")) %>%  rename_at(vars(`Hs_Dabob Bay-Ambient`:`Hs_Fidalgo Bay-High`), paste0, ".z"), 
               varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
    mutate(variable=gsub("\\.x", ".adult", variable)) %>%
    mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
    mutate(variable=gsub("\\.z", ".juvenile", variable)) %>%
    mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
    mutate_each_(funs(factor(.)),c("population", "pCO2", "stage")) %>%
  mutate(stage=fct_relevel(stage, c("adult", "larvae", "juvenile")))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 824 rows [4945,
## 4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956, 4957, 4958,
## 4959, 4960, 4961, 4962, 4963, 4964, ...].
ggplot(subset(dpf.ALJ, population!="Oyster Bay C2")%>% droplevels(), aes(x=population:stage:pCO2, y=Depth)) + 
  geom_violin(aes(fill=stage, alpha=pCO2), adjust = 1.2) + xlab("Parental pCO2 exposure") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity: Adults, Larvae, and Juveniles\nNo. shared SNPs =", nrow(myDiff.ALJ))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
    geom_text(data=subset(dpf.ALJ, population!="Oyster Bay C2")%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), 
              aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
  scale_alpha_discrete(guide="none") + 
  annotate("text", x = 3, y = .6, label = "Dabob Bay", size=3.5) +
  annotate("text", x = 9, y = .6, label = "Fidalgo Bay", size=3.5) +
  annotate("text", x = 14.5, y = .6, label = "Oyster Bay", size=3.5) +
  geom_vline(xintercept=6.5) + geom_vline(xintercept=12.5) + 
  scale_x_discrete(labels=rep(c("Ambient", "High"), times=8)) 
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.

# Allelic Richness 
dpf.n.ALJ <- melt(select(myDiff.ALJ,contains("n_")) %>%  rename_at(vars(`n_Dabob Bay-Ambient`:`n_Fidalgo Bay-High`), paste0, ".z"), 
               varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
    mutate(variable=gsub("\\.x", ".adult", variable)) %>%
    mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
    mutate(variable=gsub("\\.z", ".juvenile", variable)) %>%
    mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
    mutate_each_(funs(factor(.)),c("population", "pCO2", "stage")) %>%
  mutate(stage=fct_relevel(stage, c("adult", "larvae", "juvenile")))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 824 rows [4945,
## 4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956, 4957, 4958,
## 4959, 4960, 4961, 4962, 4963, 4964, ...].
ggplot(subset(dpf.n.ALJ, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) + 
  geom_violin(aes(fill=stage, alpha=pCO2),width=1.3) + xlab("Parental pCO2 exposure") + ylab("") + theme_bw() +
    theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness: Adults, Larvae, and Juveniles\nNo. shared SNPs =", nrow(myDiff.ALJ))) + 
  scale_alpha_discrete(range = c(0.35, 0.9)) +
        #stat_summary(fun.y=mean, geom="point", shape=15, size=2, color="black", fill="black") +
    geom_text(data=subset(dpf.n.ALJ, population!="Oyster Bay C2")%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), 
              aes(x=population:stage:pCO2, y=3, label=round(mean, 2)), size=2.5) +
  #scale_colour_manual(values=c("black", "dodgerblue3", "darkred")) + 
  scale_alpha_discrete(guide="none") + geom_vline(xintercept=6.5) + geom_vline(xintercept=12.5) + 
  annotate("text", x = 3, y = 30, label = "Dabob Bay", size=3.5) +
  annotate("text", x = 9, y = 30, label = "Fidalgo Bay", size=3.5) +
  annotate("text", x = 13.5, y = 30, label = "Oyster\nBay", size=3.5) +
  scale_x_discrete(labels=rep(c("Ambient", "High"), times=8))
## Warning: Using alpha for a discrete variable is not advised.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: position_dodge requires non-overlapping x intervals

Identify siblings using Colony

I will use Colony to identify siblings in my samples. The Colony software is only available via the command line for Mac users (it’s typically used via GUI on Windows). To get the most current version of the program I actually had to email the developer. He send me a link to download it. I’m running Colony Version 2.0.6.6 (June 30, 2020), which is saved on my computer and in this project’s repo (references/colony2.mac.20200904).

Prepare files for Colony sibship analysis Colony identifies siblings, which I would like to do with my juveniles. Here I prepare files for Colony.

Mac suggested that I only keep SNPs with very high minor allele frequencies. Here I use SNPRelate to export SNPs with MAF >=2%.

Adult Sibship analysis via Colony

snpset.adult.4Colony <- snpgdsLDpruning(genofile.adult, maf=.2, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 174,997 SNPs (monomorphic: TRUE, MAF: 0.2, missing rate: 0)
##     # of samples: 53
##     # of SNPs: 246
##     using 1 thread
##     sliding window: 500,000 basepairs, Inf SNPs
##     |LD| threshold: 0.2
##     method: composite
## Chromosome Contig0_5313: 0.11%, 23/21,863
## Chromosome Contig104153_109739: 0.08%, 2/2,360
## Chromosome Contig109741_116186: 0.16%, 3/1,861
## Chromosome Contig11179_21262: 0.16%, 25/15,668
## Chromosome Contig116188_123130: 0.06%, 1/1,794
## Chromosome Contig123131_129982: 0.06%, 1/1,787
## Chromosome Contig129983_136896: 0.06%, 1/1,798
## Chromosome Contig136897_143777: 0.05%, 1/1,887
## Chromosome Contig150711_166372: 0.13%, 2/1,485
## Chromosome Contig166374_181896: 0.11%, 2/1,875
## Chromosome Contig181898_197428: 0.13%, 2/1,516
## Chromosome Contig21263_28135: 0.15%, 27/18,170
## Chromosome Contig213221_398012: 0.16%, 3/1,819
## Chromosome Contig28136_34745: 0.12%, 19/15,929
## Chromosome Contig34748_41067: 0.20%, 27/13,582
## Chromosome Contig398123_696856: 0.17%, 3/1,721
## Chromosome Contig41068_47193: 0.08%, 9/11,130
## Chromosome Contig47194_53180: 0.07%, 6/9,167
## Chromosome Contig5314_11178: 0.07%, 4/5,362
## Chromosome Contig53181_59083: 0.05%, 4/7,588
## Chromosome Contig59084_64831: 0.11%, 7/6,364
## Chromosome Contig64832_70524: 0.20%, 11/5,633
## Chromosome Contig70525_76135: 0.15%, 7/4,689
## Chromosome Contig76136_81765: 0.12%, 5/4,040
## Chromosome Contig81766_87352: 0.10%, 4/4,163
## Chromosome Contig87353_92929: 0.18%, 5/2,773
## Chromosome Contig92931_98526: 0.15%, 4/2,712
## Chromosome Contig98527_104152: 0.13%, 4/2,984
## 212 markers are selected in total.
snpset.adult.id.4Colony <- unlist(unname(snpset.adult.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-adult.gds",
                    dest.fn="../qc-processing/colony/genotypes-adult-4Colony.gds",
                    snp.id=snpset.adult.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 53 samples and 212 SNPs
##     write sample.id
##     write snp.id
##     write snp.rs.id
##     write snp.position
##     write snp.chromosome
##     write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

Recode genotypes for Colony

  • SNPRelate codes alleles this way: “There are four possible values stored in the variable genotype: 0, 1, 2 and 3. For bi-allelic SNP sites, “0” indicates two B alleles, “1” indicates one A allele and one B allele, “2” indicates two A alleles, and “3” is a missing genotype. (When you) take out genotype data with sample and SNP IDs, and four possible values are returned 0, 1, 2 and NA (3 is replaced by NA).

  • Colony codes alleles this way: “The first column gives the individual ID (a string containing a maximum of 20 letters and/or numbers, no other characters are allowed), the 2nd and 3rd columns give the alleles observed for the individual at the first locus, the 4th and 5th give the alleles observed for the individual at the 2nd locus … An allele is identified by an integer, in the range of 1~999999999. If the locus is a dominant marker, then only one (instead of 2) column is required for the marker, and the value for the genotype should be either 1 (dominant phenotype, presence of a band) or 2 (recessive phenotype, absence of a band). Missing genotypes are indicated by 0 0 for a codominant marker and 0 for a dominant marker.” Also, there cannot be any NA values in the Colony genotype data.

I wanted to use the R program radiator to create a Colony formatted genotype file. But after a TON of attempts I wasn’t able to get my SNPRelate .gds data into radiator directly nor could I successfully import a .vcf file.

So, here I manually re-code the genotype matrix.

# Extract genotype matrix 
geno.matrix4Colony.adult <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-adult-4Colony.gds", with.id=TRUE)
## Genotype matrix: 53 samples X 212 SNPs
paste("No. of adult SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.adult$genotype), sep="") 
## [1] "No. of adult SNPs for Colony sibship analysis: 212"
paste("No. of adult samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.adult$genotype), sep="") 
## [1] "No. of adult samples for Colony sibship analysis: 53"
# Replace SNPRelate numeric allele coding with character allele coding 
a <- geno.matrix4Colony.adult$genotype %>% as_data_frame() %>% 
  mutate_all(funs(str_replace(., "0", "BB"))) %>%
  mutate_all(funs(str_replace(., "1", "AB"))) %>%
  mutate_all(funs(str_replace(., "2", "AA")))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info. 
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE 
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2. 
c <- b  #duplicate dataframe 
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])

for (i in 1:length(odds)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)  
}
for (i in 1:length(evens)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)  
}

# Now replace the character allele information with Colony's way of numeric coding. 
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele 
# 2= A allele 
# 0= missing allele 
(geno.matrix4Colony.adult.recode <- c %>% 
  mutate_all(funs(str_replace(., "B", "1"))) %>%
  mutate_all(funs(str_replace(., "A", "2"))) %>%
  mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 53 x 424
##    v1    v1_2  v2    v2_2  v3    v3_2  v4    v4_2  v5    v5_2  v6    v6_2  v7   
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 2     1     2     2     2     1     2     2     2     1     1     1     2    
##  2 2     2     2     2     1     1     1     1     2     1     1     1     2    
##  3 2     2     2     1     2     1     2     1     2     2     2     2     2    
##  4 2     2     2     1     2     1     1     1     1     1     2     2     2    
##  5 2     2     1     1     2     2     2     1     2     1     2     2     1    
##  6 2     2     2     1     2     1     2     1     2     1     1     1     2    
##  7 2     2     2     2     2     1     2     1     2     2     2     2     2    
##  8 2     2     1     1     2     1     1     1     1     1     1     1     2    
##  9 2     2     1     1     2     1     1     1     1     1     1     1     2    
## 10 2     1     2     1     2     1     1     1     2     1     2     2     2    
## # … with 43 more rows, and 411 more variables: v7_2 <chr>, v8 <chr>,
## #   v8_2 <chr>, v9 <chr>, v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>,
## #   v11_2 <chr>, v12 <chr>, v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>,
## #   v14_2 <chr>, v15 <chr>, v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>,
## #   v17_2 <chr>, v18 <chr>, v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>,
## #   v20_2 <chr>, v21 <chr>, v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>,
## #   v23_2 <chr>, v24 <chr>, v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>,
## #   v26_2 <chr>, v27 <chr>, v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>,
## #   v29_2 <chr>, v30 <chr>, v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>,
## #   v32_2 <chr>, v33 <chr>, v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>,
## #   v35_2 <chr>, v36 <chr>, v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>,
## #   v38_2 <chr>, v39 <chr>, v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>,
## #   v41_2 <chr>, v42 <chr>, v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>,
## #   v44_2 <chr>, v45 <chr>, v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>,
## #   v47_2 <chr>, v48 <chr>, v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>,
## #   v50_2 <chr>, v51 <chr>, v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>,
## #   v53_2 <chr>, v54 <chr>, v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>,
## #   v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number 
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.adult$sample.id, sep=""),
                  geno.matrix4Colony.adult.recode),
            file="../qc-processing/colony/geno.matrix.adult.tab", delim="   ", col_names=FALSE)

Create Colony input file

I created a text file that will be the input for Colony. I used Colony’s example input file to do so.

Here’s a snapshot of the input file, truncating the genotype info (there are 53 samples, but here I only show 2)

head -n 28 ../qc-processing/colony/colony2_adults.dat
tail -n 12 ../qc-processing/colony/colony2_adults.dat
## 'adult_snps'  !Dataset name
## 'adult_colony'  !Output file name
## 53         ! Number of offspring in the sample
## 212        ! Number of loci
## 100        ! Seed for random number generator
## 0          ! 0/1=Not updating/updating allele frequency
## 2          ! 2/1=Dioecious/Monoecious species
## 0          ! 0/1=No inbreeding/inbreeding
## 0          ! 0/1=Diploid species/HaploDiploid species
## 0  0       ! 0/1=Polygamy/Monogamy for males & females
## 0          ! 0/1=Clone inference =No/Yes
## 0          ! 0/1=Full sibship size scaling =No/Yes
## 0          ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0          ! 0/1=Unknown/Known population allele frequency
## 1         ! Number of runs
## 2         ! Length of run
## 0         ! 0/1=Monitor method by Iterate#/Time in second
## 100000    ! Monitor interval in Iterate# / in seconds
## 0         ! non-Windows version
## 1         ! Fulllikelihood
## 3         ! 1/2/3=low/medium/high Precision for Fulllikelihood
## 
## V@        !Marker names
## 0@        !Marker types, 0/1 = codominant/dominant
## 0.000@    !Allelic dropout rate  
## 0.0001@   !false allele rate
## 
## O291 2 1 2 2 2 1 2 2 2 1 1 1 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 1 1 1 1 2 1 2 1 2 2 2 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 1 1 2 1 1 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 1 1 1 2 2 2 1 1 1 2 2 2 2 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 1 2 2 2 2 2 1 2 2 2 1
## O349 2 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 1 1 2 2 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 1 2 2 2 1 1 1 2 2 2 1 2 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 2 2 1 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 2 1 1 2 1 1 1 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2 1 2 1
## 
## 0  0     !prob. of dad/mum included in the candidates
## 0  0       !numbers of candiadte males & females
## 0  0          !#known fater-offspring dyads, paternity exclusion threshold
## 0  0          !#known moter-offspring dyads, maternity exclusion threshold
## 0             !#known paternal sibship with unknown fathers
## 0             !#known maternal sibship with unknown mothers
## 0             !#known paternity exclusions
## 0             !#known maternity exclusions
## 0             !#known paternal sibship exclusions
## 0             !#known maternal sibship exclusions

Run Colony analysis on adult samples

Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script

cd ../qc-processing/colony/
./colony2s.out IFN:colony2_adults.dat
cat ../qc-processing/colony/adult_colony.BestCluster
## ClusterIndex Probability          OffspringID             FatherID             MotherID
##            1      0.7953                 O291                   *1                   #1
##            1      0.7953                 O293                   *3                   #1
##            1      0.7953                 O294                   *3                   #3
##            1      0.7953                 O302                   *1                   #3
##            1      0.7953                 O305                   *1                   #3
##            1      0.7953                 O307                   *1                   #3
##            1      0.7953                 O309                   *1                   #3
##            2      0.9974                 O292                   *2                   #2
##            2      0.9974                 O296                   *2                   #5
##            2      0.9974                 O299                   *2                   #5
##            2      0.9974                 O301                   *2                   #2
##            3      0.9199                 O295                   *4                   #4
##            3      0.9199                 O303                   *4                   #4
##            3      0.9199                 O304                   *4                   #4
##            3      0.9199                 O308                   *4                   #4
##            4      0.9864                 O298                   *5                   #6
##            4      0.9864                 O306                   *6                   #6
##            5      0.9859                 O311                   *7                   #7
##            5      0.9859                 O315                   *7                  #11
##            5      0.9859                 O318                   *7                  #12
##            5      0.9859                 O328                   *7                  #11
##            5      0.9859                 O329                   *7                  #11
##            6      0.7866                 O312                   *8                   #8
##            6      0.7866                 O314                   *8                  #10
##            6      0.7866                 O316                   *8                   #8
##            6      0.7866                 O317                   *8                  #10
##            6      0.7866                 O321                   *8                  #10
##            6      0.7866                 O323                   *8                   #8
##            6      0.7866                 O327                   *8                  #10
##            7      0.2345                 O313                   *9                   #9
##            7      0.2345                 O319                  *10                  #13
##            7      0.2345                 O322                   *9                  #13
##            7      0.2345                 O324                   *9                   #9
##            7      0.2345                 O326                   *9                  #13
##            8      0.3050                 O325                  *11                  #14
##            8      0.3050                 O333                  *14                  #17
##            8      0.3050                 O337                  *14                  #14
##            9      0.9090                 O331                  *12                  #15
##            9      0.9090                 O336                  *12                  #19
##            9      0.9090                 O341                  *17                  #15
##            9      0.9090                 O342                  *12                  #22
##            9      0.9090                 O345                  *19                  #22
##            9      0.9090                 O346                  *19                  #15
##            9      0.9090                 O348                  *12                  #22
##           10      0.9796                 O332                  *13                  #16
##           10      0.9796                 O334                  *15                  #16
##           10      0.9796                 O339                  *13                  #21
##           10      0.9796                 O347                  *13                  #16
##           11      0.9952                 O335                  *16                  #18
##           11      0.9952                 O338                  *16                  #20
##           11      0.9952                 O343                  *16                  #20
##           12      0.9870                 O344                  *18                  #23
##           12      0.9870                 O349                  *18                  #23
colony.clusters.adult <- read_table("../qc-processing/colony/adult_colony.BestCluster", col_names=TRUE) %>% clean_names() %>% 
  mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
  mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
  mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.adult, by = c("offspring_id"="sample"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ClusterIndex = col_double(),
##   Probability = col_double(),
##   OffspringID = col_character(),
##   FatherID = col_character(),
##   MotherID = col_character()
## )
save(colony.clusters.adult, file = "../qc-processing/colony/best-cluster-adult")  

# Plot mother ~ father ID 
#ggplotly(
  ggplot(colony.clusters.adult, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none") #)

# How many fathers and mothers does each population + pCO2 group have? 
colony.clusters.adult %>%
  group_by(population, pCO2.parent) %>%
  summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) 
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups:   population [3]
##   population    pCO2.parent n_fathers n_mothers
##   <chr>         <chr>           <int>     <int>
## 1 Dabob Bay     Ambient             4         4
## 2 Dabob Bay     High                5         6
## 3 Fidalgo Bay   Ambient             6         5
## 4 Fidalgo Bay   High                5         8
## 5 Oyster Bay C1 Ambient             4         6
## 6 Oyster Bay C1 High                4         7
# How many fathers and mothers does each population have
colony.clusters.adult %>%
  group_by(population) %>%
  summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) 
## # A tibble: 3 x 3
##   population    n_fathers n_mothers
##   <chr>             <int>     <int>
## 1 Dabob Bay             6         6
## 2 Fidalgo Bay           8        10
## 3 Oyster Bay C1         5         8

Larval Sibship analysis via Colony

snpset.larvae.4Colony <- snpgdsLDpruning(genofile.larvae, maf=.2, missing.rate=0.15, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 127,940 SNPs (monomorphic: TRUE, MAF: 0.2, missing rate: 0.15)
##     # of samples: 76
##     # of SNPs: 282
##     using 1 thread
##     sliding window: 500,000 basepairs, Inf SNPs
##     |LD| threshold: 0.2
##     method: composite
## Chromosome Contig0_5313: 0.13%, 20/15,947
## Chromosome Contig104153_109739: 0.25%, 4/1,625
## Chromosome Contig109741_116186: 0.15%, 2/1,328
## Chromosome Contig11179_21262: 0.21%, 26/12,305
## Chromosome Contig116188_123130: 0.50%, 6/1,207
## Chromosome Contig123131_129982: 0.27%, 3/1,099
## Chromosome Contig129983_136896: 0.33%, 4/1,222
## Chromosome Contig136897_143777: 0.23%, 3/1,321
## Chromosome Contig143780_150710: 0.25%, 3/1,179
## Chromosome Contig150711_166372: 0.19%, 2/1,031
## Chromosome Contig166374_181896: 0.09%, 1/1,077
## Chromosome Contig181898_197428: 0.09%, 1/1,122
## Chromosome Contig197431_213218: 0.18%, 2/1,107
## Chromosome Contig21263_28135: 0.18%, 24/13,677
## Chromosome Contig213221_398012: 0.17%, 2/1,191
## Chromosome Contig28136_34745: 0.17%, 22/12,701
## Chromosome Contig34748_41067: 0.14%, 15/10,383
## Chromosome Contig41068_47193: 0.16%, 13/7,913
## Chromosome Contig47194_53180: 0.13%, 9/6,968
## Chromosome Contig5314_11178: 0.14%, 5/3,691
## Chromosome Contig53181_59083: 0.19%, 11/5,690
## Chromosome Contig59084_64831: 0.23%, 10/4,432
## Chromosome Contig64832_70524: 0.30%, 12/4,031
## Chromosome Contig70525_76135: 0.32%, 11/3,438
## Chromosome Contig76136_81765: 0.07%, 2/2,871
## Chromosome Contig81766_87352: 0.11%, 3/2,843
## Chromosome Contig87353_92929: 0.31%, 6/1,952
## Chromosome Contig92931_98526: 0.42%, 8/1,900
## Chromosome Contig98527_104152: 0.10%, 2/1,979
## 232 markers are selected in total.
snpset.larvae.id.4Colony <- unlist(unname(snpset.larvae.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-larvae.gds",
                    dest.fn="../qc-processing/colony/genotypes-larvae-4Colony.gds",
                    snp.id=snpset.larvae.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 76 samples and 232 SNPs
##     write sample.id
##     write snp.id
##     write snp.rs.id
##     write snp.position
##     write snp.chromosome
##     write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

Recode genotypes for Colony

# Extract genotype matrix 
geno.matrix4Colony.larvae <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-larvae-4Colony.gds", with.id=TRUE)
## Genotype matrix: 76 samples X 232 SNPs
paste("No. of larvae SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.larvae$genotype), sep="") 
## [1] "No. of larvae SNPs for Colony sibship analysis: 232"
paste("No. of larvae samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.larvae$genotype), sep="") 
## [1] "No. of larvae samples for Colony sibship analysis: 76"
# Replace SNPRelate numeric allele coding with character allele coding 
a <- geno.matrix4Colony.larvae$genotype %>% as_data_frame() %>% 
  mutate_all(funs(str_replace(., "0", "BB"))) %>%
  mutate_all(funs(str_replace(., "1", "AB"))) %>%
  mutate_all(funs(str_replace(., "2", "AA")))

# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info. 
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE 
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2. 
c <- b  #duplicate dataframe 
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])

for (i in 1:length(odds.names)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)  
}
for (i in 1:length(evens.names)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)  
}

# Now replace the character allele information with Colony's way of numeric coding. 
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele 
# 2= A allele 
# 0= missing allele 
(geno.matrix4Colony.larvae.recode <- c %>% 
  mutate_all(funs(str_replace(., "B", "1"))) %>%
  mutate_all(funs(str_replace(., "A", "2"))) %>%
  mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 76 x 464
##    v1    v1_2  v2    v2_2  v3    v3_2  v4    v4_2  v5    v5_2  v6    v6_2  v7   
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 2     2     2     1     2     2     0     0     2     2     2     2     2    
##  2 0     0     2     2     1     1     0     0     0     0     0     0     2    
##  3 0     0     2     2     0     0     0     0     2     2     2     1     0    
##  4 2     2     2     1     2     2     2     2     2     2     1     1     2    
##  5 1     1     2     1     2     2     1     1     2     2     2     2     2    
##  6 0     0     2     2     2     2     1     1     2     2     2     2     2    
##  7 0     0     0     0     2     2     0     0     0     0     0     0     2    
##  8 0     0     0     0     0     0     0     0     2     2     2     2     2    
##  9 2     1     2     1     2     2     2     2     2     2     2     2     0    
## 10 2     1     2     1     2     2     2     2     1     1     2     2     2    
## # … with 66 more rows, and 451 more variables: v7_2 <chr>, v8 <chr>,
## #   v8_2 <chr>, v9 <chr>, v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>,
## #   v11_2 <chr>, v12 <chr>, v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>,
## #   v14_2 <chr>, v15 <chr>, v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>,
## #   v17_2 <chr>, v18 <chr>, v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>,
## #   v20_2 <chr>, v21 <chr>, v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>,
## #   v23_2 <chr>, v24 <chr>, v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>,
## #   v26_2 <chr>, v27 <chr>, v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>,
## #   v29_2 <chr>, v30 <chr>, v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>,
## #   v32_2 <chr>, v33 <chr>, v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>,
## #   v35_2 <chr>, v36 <chr>, v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>,
## #   v38_2 <chr>, v39 <chr>, v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>,
## #   v41_2 <chr>, v42 <chr>, v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>,
## #   v44_2 <chr>, v45 <chr>, v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>,
## #   v47_2 <chr>, v48 <chr>, v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>,
## #   v50_2 <chr>, v51 <chr>, v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>,
## #   v53_2 <chr>, v54 <chr>, v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>,
## #   v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number 
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.larvae$sample.id, sep=""),
                  geno.matrix4Colony.larvae.recode),
            file="../qc-processing/colony/geno.matrix.larvae.tab", delim="   ", col_names=FALSE)

Create Colony input file

I created a text file that will be the input for Colony. I used Colony’s example input file as template and edited by hand. For the genotype matrix I selected all contents in the “geno.matrix.larvae.tab” file and pasted it into the text file.

Here’s a snapshot of the input file, truncating the genotype info (there are 76 samples, but here I only show 4)

head -n 30 ../qc-processing/colony/colony2_larvae.dat
tail -n 12 ../qc-processing/colony/colony2_larvae.dat
## 'larvae_snps'  !Dataset name
## 'larvae_colony'  !Output file name
## 76         ! Number of offspring in the sample
## 231        ! Number of loci
## 100        ! Seed for random number generator
## 0          ! 0/1=Not updating/updating allele frequency
## 2          ! 2/1=Dioecious/Monoecious species
## 0          ! 0/1=No inbreeding/inbreeding
## 0          ! 0/1=Diploid species/HaploDiploid species
## 0  0       ! 0/1=Polygamy/Monogamy for males & females
## 0          ! 0/1=Clone inference =No/Yes
## 0          ! 0/1=Full sibship size scaling =No/Yes
## 0          ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0          ! 0/1=Unknown/Known population allele frequency
## 1         ! Number of runs
## 2         ! Length of run
## 0         ! 0/1=Monitor method by Iterate#/Time in second
## 100000    ! Monitor interval in Iterate# / in seconds
## 0         ! non-Windows version
## 1         ! Fulllikelihood
## 3         ! 1/2/3=low/medium/high Precision for Fulllikelihood
## 
## V@        !Marker names
## 0@        !Marker types, 0/1 = codominant/dominant
## 0.000@    !Allelic dropout rate  
## 0.0001@   !false allele rate
## 
## O034 2 2 2 1 2 2 0 0 2 2 2 2 2 1 2 1 2 2 2 2 0 0 0 0 1 1 2 2 0 0 2 1 2 2 2 2 1 1 2 1 2 2 1 1 2 2 2 1 2 2 2 2 2 1 2 2 1 1 0 0 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 1 0 0 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 1 2 1 2 1 1 1 0 0 2 1 2 2 0 0 2 2 1 1 1 1 1 1 2 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 0 0 2 2 2 2 1 1 2 1 0 0 2 2 2 1 0 0 2 2 2 2 1 1 2 2 2 2 2 1 0 0 2 2 0 0 2 1 1 1 2 2 2 2 2 2 2 2 0 0 2 2 2 2 0 0 2 2 2 2 2 2 1 1 2 1 2 2 0 0 1 1 2 2 2 1 2 2 2 2 2 2 2 2 0 0 0 0 2 2 0 0 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 0 0 1 1 2 1 2 1 2 2 2 1 1 1 0 0 2 1 0 0 2 2 2 2 2 1 0 0 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 2 1 2 1 2 2 0 0 2 1 2 1 1 1 2 2 2 2 2 2 1 1 0 0 2 2 0 0 2 2 2 2 2 2 0 0 2 1 2 1 2 1 1 1 2 1 0 0 2 2 2 1 2 2 2 2 2 2 2 2 0 0 2 2 2 1 2 2 1 1 2 1 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 0 0 2 2 0 0 2 2 2 1 0 0 2 1 0 0 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 0 0 0 0 2 2 2 1 2 2 2 2 0 0 2 2 2 1 2 1 2 1 2 2 0 0 2 2 1 1 2 1 0 0
## O035 0 0 2 2 1 1 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 1 1 2 2 0 0 2 2 0 0 1 1 2 2 0 0 0 0 1 1 2 2 2 2 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 1 1 0 0 0 0 0 0 2 2
## O037 0 0 2 2 0 0 0 0 2 2 2 1 0 0 2 2 0 0 0 0 1 1 0 0 0 0 0 0 1 1 2 2 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 1 1 1 0 0 2 2 2 2 0 0 2 2 0 0 2 2 2 2 0 0 2 2 2 2 0 0 2 2 2 2 2 2 2 2 0 0 1 1 0 0 0 0 0 0 2 2 0 0 0 0 2 1 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 1 0 0 2 2 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 2 1 0 0 2 2 0 0 2 2 1 1 0 0 0 0 2 2 2 2 0 0 2 2 2 1 0 0 0 0 2 2 0 0 2 2 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 1 2 2 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 0 0 0 0 2 2 0 0 1 1 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 2 2 2 1 1 0 0 1 1 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0
## O565 2 2 2 2 0 0 0 0 2 2 0 0 2 2 2 2 0 0 2 2 0 0 2 2 2 1 2 2 2 2 0 0 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 1 0 0 0 0 2 2 2 2 0 0 0 0 2 2 2 2 0 0 0 0 2 2 2 2 2 2 0 0 1 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 1 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 1 1 0 0 2 2 2 2 0 0 2 1 0 0 0 0 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 1 2 2 0 0 0 0 2 2 2 2 2 2 0 0 0 0 2 2 1 1 0 0 1 1 0 0 2 2 2 2 0 0 0 0 2 2 2 2 2 2 0 0 0 0 1 1 1 1 2 2 2 2 0 0 2 2 2 2 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 2 1 2 2 0 0 2 2 0 0 2 2 2 2 0 0 2 2 2 2 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 2 2 2 2 2 2 0 0 2 2 2 2 1 1 2 2 2 2 0 0 2 2 2 2 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 1 1 0 0 2 2 2 2 0 0 2 1 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 0 0 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 0 0 0 0 2 2 0 0 2 2 2 1 2 1 0 0 2 2 1 1
## 
## 0  0     !prob. of dad/mum included in the candidates
## 0  0       !numbers of candiadte males & females
## 0  0          !#known fater-offspring dyads, paternity exclusion threshold
## 0  0          !#known moter-offspring dyads, maternity exclusion threshold
## 0             !#known paternal sibship with unknown fathers
## 0             !#known maternal sibship with unknown mothers
## 0             !#known paternity exclusions
## 0             !#known maternity exclusions
## 0             !#known paternal sibship exclusions
## 0             !#known maternal sibship exclusions

Run Colony analysis on larvae samples

Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script

cd ../qc-processing/colony/
./colony2s.out IFN:colony2_larvae.dat
cat ../qc-processing/colony/larvae_colony.BestCluster
## ClusterIndex Probability          OffspringID             FatherID             MotherID
##            1      0.6778                 O034                   *1                   #1
##            1      0.6778                 O037                   *3                   #1
##            1      0.6778                 O044                   *3                   #1
##            1      0.6778                 O046                   *7                   #1
##            1      0.6778                 O047                   *7                   #7
##            1      0.6778                 O483                   *3                  #23
##            1      0.6778                 O485                  *23                  #25
##            1      0.6778                 O487                  *23                  #25
##            1      0.6778                 O526                   *7                  #25
##            1      0.6778                 O528                  *23                  #32
##            2      1.0000                 O035                   *2                   #2
##            2      1.0000                 O412                   *2                   #2
##            2      1.0000                 O441                   *2                   #2
##            2      1.0000                 O565                   *2                   #2
##            3      0.1356                 O039                   *4                   #3
##            3      0.1356                 O484                  *22                  #24
##            3      0.1356                 O513                   *4                  #24
##            3      0.1356                 O531                  *22                  #34
##            3      0.1356                 O542                  *33                   #3
##            3      0.1356                 O543                  *22                  #36
##            3      0.1356                 O552                   *4                  #37
##            3      0.1356                 O553                  *34                  #36
##            4      0.8784                 O041                   *5                   #4
##            4      0.8784                 O043                   *5                   #5
##            5      0.4926                 O045                   *6                   #6
##            5      0.4926                 O489                  *24                  #26
##            5      0.4926                 O522                  *27                  #29
##            5      0.4926                 O523                  *28                  #30
##            5      0.4926                 O524                  *28                  #31
##            5      0.4926                 O525                  *28                   #6
##            5      0.4926                 O527                  *29                  #31
##            5      0.4926                 O562                  *27                  #30
##            5      0.4926                 O564                  *24                  #30
##            6      0.9721                 O401                   *8                   #8
##            6      0.9721                 O402                   *9                   #8
##            6      0.9721                 O403                  *10                   #8
##            6      0.9721                 O404                  *11                   #8
##            6      0.9721                 O411                  *12                   #8
##            6      0.9721                 O413                  *13                   #8
##            6      0.9721                 O414                   *8                   #9
##            6      0.9721                 O421                  *14                   #8
##            6      0.9721                 O431                  *15                   #8
##            6      0.9721                 O432                   *8                   #8
##            6      0.9721                 O434                  *15                  #10
##            6      0.9721                 O442                  *16                   #9
##            6      0.9721                 O444                  *18                   #9
##            6      0.9721                 O445                  *18                   #9
##            6      0.9721                 O451                  *18                  #12
##            6      0.9721                 O453                  *16                  #12
##            7      0.1504                 O443                  *17                  #11
##            7      0.1504                 O452                  *17                  #13
##            7      0.1504                 O472                  *17                  #17
##            7      0.1504                 O473                  *17                  #18
##            7      0.1504                 O475                  *17                  #18
##            7      0.1504                 O476                  *17                  #17
##            7      0.1504                 O477                  *17                  #20
##            7      0.1504                 O521                  *26                  #13
##            7      0.1504                 O554                  *26                  #38
##            8      0.8628                 O461                  *19                  #14
##            8      0.8628                 O462                  *19                  #15
##            8      0.8628                 O471                  *19                  #16
##            8      0.8628                 O474                  *19                  #19
##            9      0.8967                 O481                  *20                  #21
##            9      0.8967                 O492                  *20                  #27
##            9      0.8967                 O506                  *20                  #28
##            9      0.8967                 O532                  *31                  #35
##            9      0.8967                 O533                  *32                  #35
##            9      0.8967                 O551                  *31                  #27
##            9      0.8967                 O561                  *35                  #35
##            9      0.8967                 O563                  *31                  #27
##           10      0.9372                 O482                  *21                  #22
##           10      0.9372                 O488                  *21                  #22
##           10      0.9372                 O490                  *25                  #22
##           10      0.9372                 O491                  *25                  #22
##           10      0.9372                 O541                  *21                  #22
##           11      0.7313                 O529                  *30                  #33
(colony.clusters.larvae <- read_table("../qc-processing/colony/larvae_colony.BestCluster", col_names=TRUE) %>% clean_names() %>% 
  mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
  mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
  mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.larvae, by = c("offspring_id"="sample")))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ClusterIndex = col_double(),
##   Probability = col_double(),
##   OffspringID = col_character(),
##   FatherID = col_character(),
##   MotherID = col_character()
## )
##    cluster_index probability offspring_id father_id mother_id  stage
## 1              1      0.6778          034         1         1 larvae
## 2              1      0.6778          037         3         1 larvae
## 3              1      0.6778          044         3         1 larvae
## 4              1      0.6778          046         7         1 larvae
## 5              1      0.6778          047         7         7 larvae
## 6              1      0.6778          483         3        23 larvae
## 7              1      0.6778          485        23        25 larvae
## 8              1      0.6778          487        23        25 larvae
## 9              1      0.6778          526         7        25 larvae
## 10             1      0.6778          528        23        32 larvae
## 11             2      1.0000          035         2         2 larvae
## 12             2      1.0000          412         2         2 larvae
## 13             2      1.0000          441         2         2 larvae
## 14             2      1.0000          565         2         2 larvae
## 15             3      0.1356          039         4         3 larvae
## 16             3      0.1356          484        22        24 larvae
## 17             3      0.1356          513         4        24 larvae
## 18             3      0.1356          531        22        34 larvae
## 19             3      0.1356          542        33         3 larvae
## 20             3      0.1356          543        22        36 larvae
## 21             3      0.1356          552         4        37 larvae
## 22             3      0.1356          553        34        36 larvae
## 23             4      0.8784          041         5         4 larvae
## 24             4      0.8784          043         5         5 larvae
## 25             5      0.4926          045         6         6 larvae
## 26             5      0.4926          489        24        26 larvae
## 27             5      0.4926          522        27        29 larvae
## 28             5      0.4926          523        28        30 larvae
## 29             5      0.4926          524        28        31 larvae
## 30             5      0.4926          525        28         6 larvae
## 31             5      0.4926          527        29        31 larvae
## 32             5      0.4926          562        27        30 larvae
## 33             5      0.4926          564        24        30 larvae
## 34             6      0.9721          401         8         8 larvae
## 35             6      0.9721          402         9         8 larvae
## 36             6      0.9721          403        10         8 larvae
## 37             6      0.9721          404        11         8 larvae
## 38             6      0.9721          411        12         8 larvae
## 39             6      0.9721          413        13         8 larvae
## 40             6      0.9721          414         8         9 larvae
## 41             6      0.9721          421        14         8 larvae
## 42             6      0.9721          431        15         8 larvae
## 43             6      0.9721          432         8         8 larvae
## 44             6      0.9721          434        15        10 larvae
## 45             6      0.9721          442        16         9 larvae
## 46             6      0.9721          444        18         9 larvae
## 47             6      0.9721          445        18         9 larvae
## 48             6      0.9721          451        18        12 larvae
## 49             6      0.9721          453        16        12 larvae
## 50             7      0.1504          443        17        11 larvae
## 51             7      0.1504          452        17        13 larvae
## 52             7      0.1504          472        17        17 larvae
## 53             7      0.1504          473        17        18 larvae
## 54             7      0.1504          475        17        18 larvae
## 55             7      0.1504          476        17        17 larvae
## 56             7      0.1504          477        17        20 larvae
## 57             7      0.1504          521        26        13 larvae
## 58             7      0.1504          554        26        38 larvae
## 59             8      0.8628          461        19        14 larvae
## 60             8      0.8628          462        19        15 larvae
## 61             8      0.8628          471        19        16 larvae
## 62             8      0.8628          474        19        19 larvae
## 63             9      0.8967          481        20        21 larvae
## 64             9      0.8967          492        20        27 larvae
## 65             9      0.8967          506        20        28 larvae
## 66             9      0.8967          532        31        35 larvae
## 67             9      0.8967          533        32        35 larvae
## 68             9      0.8967          551        31        27 larvae
## 69             9      0.8967          561        35        35 larvae
## 70             9      0.8967          563        31        27 larvae
## 71            10      0.9372          482        21        22 larvae
## 72            10      0.9372          488        21        22 larvae
## 73            10      0.9372          490        25        22 larvae
## 74            10      0.9372          491        25        22 larvae
## 75            10      0.9372          541        21        22 larvae
## 76            11      0.7313          529        30        33 larvae
##       population pCO2.parent              pop_code
## 1  Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 2  Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 3  Oyster Bay C1        High    Oyster Bay C1-High
## 4  Oyster Bay C1        High    Oyster Bay C1-High
## 5  Oyster Bay C1        High    Oyster Bay C1-High
## 6  Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 7  Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 8  Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 9  Oyster Bay C1        High    Oyster Bay C1-High
## 10 Oyster Bay C1        High    Oyster Bay C1-High
## 11 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 12     Dabob Bay        High        Dabob Bay-High
## 13   Fidalgo Bay     Ambient   Fidalgo Bay-Ambient
## 14 Oyster Bay C2        High    Oyster Bay C2-High
## 15 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 16 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 17 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 18 Oyster Bay C2     Ambient Oyster Bay C2-Ambient
## 19 Oyster Bay C2        High    Oyster Bay C2-High
## 20 Oyster Bay C2        High    Oyster Bay C2-High
## 21 Oyster Bay C2     Ambient Oyster Bay C2-Ambient
## 22 Oyster Bay C2     Ambient Oyster Bay C2-Ambient
## 23 Oyster Bay C1        High    Oyster Bay C1-High
## 24 Oyster Bay C1        High    Oyster Bay C1-High
## 25 Oyster Bay C1        High    Oyster Bay C1-High
## 26 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 27 Oyster Bay C1        High    Oyster Bay C1-High
## 28 Oyster Bay C1        High    Oyster Bay C1-High
## 29 Oyster Bay C1        High    Oyster Bay C1-High
## 30 Oyster Bay C1        High    Oyster Bay C1-High
## 31 Oyster Bay C1        High    Oyster Bay C1-High
## 32 Oyster Bay C2        High    Oyster Bay C2-High
## 33 Oyster Bay C2        High    Oyster Bay C2-High
## 34     Dabob Bay     Ambient     Dabob Bay-Ambient
## 35     Dabob Bay     Ambient     Dabob Bay-Ambient
## 36     Dabob Bay     Ambient     Dabob Bay-Ambient
## 37     Dabob Bay     Ambient     Dabob Bay-Ambient
## 38     Dabob Bay        High        Dabob Bay-High
## 39     Dabob Bay        High        Dabob Bay-High
## 40     Dabob Bay        High        Dabob Bay-High
## 41     Dabob Bay     Ambient     Dabob Bay-Ambient
## 42     Dabob Bay        High        Dabob Bay-High
## 43     Dabob Bay        High        Dabob Bay-High
## 44     Dabob Bay        High        Dabob Bay-High
## 45   Fidalgo Bay     Ambient   Fidalgo Bay-Ambient
## 46   Fidalgo Bay     Ambient   Fidalgo Bay-Ambient
## 47   Fidalgo Bay     Ambient   Fidalgo Bay-Ambient
## 48   Fidalgo Bay        High      Fidalgo Bay-High
## 49   Fidalgo Bay        High      Fidalgo Bay-High
## 50   Fidalgo Bay     Ambient   Fidalgo Bay-Ambient
## 51   Fidalgo Bay        High      Fidalgo Bay-High
## 52   Fidalgo Bay        High      Fidalgo Bay-High
## 53   Fidalgo Bay        High      Fidalgo Bay-High
## 54   Fidalgo Bay        High      Fidalgo Bay-High
## 55   Fidalgo Bay        High      Fidalgo Bay-High
## 56   Fidalgo Bay        High      Fidalgo Bay-High
## 57 Oyster Bay C1        High    Oyster Bay C1-High
## 58 Oyster Bay C2     Ambient Oyster Bay C2-Ambient
## 59   Fidalgo Bay     Ambient   Fidalgo Bay-Ambient
## 60   Fidalgo Bay     Ambient   Fidalgo Bay-Ambient
## 61   Fidalgo Bay        High      Fidalgo Bay-High
## 62   Fidalgo Bay        High      Fidalgo Bay-High
## 63 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 64 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 65 Oyster Bay C1        High    Oyster Bay C1-High
## 66 Oyster Bay C2     Ambient Oyster Bay C2-Ambient
## 67 Oyster Bay C2     Ambient Oyster Bay C2-Ambient
## 68 Oyster Bay C2     Ambient Oyster Bay C2-Ambient
## 69 Oyster Bay C2        High    Oyster Bay C2-High
## 70 Oyster Bay C2        High    Oyster Bay C2-High
## 71 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 72 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 73 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 74 Oyster Bay C1     Ambient Oyster Bay C1-Ambient
## 75 Oyster Bay C2        High    Oyster Bay C2-High
## 76 Oyster Bay C1        High    Oyster Bay C1-High
save(colony.clusters.larvae, file = "../qc-processing/colony/best-cluster-larvae")  

# Plot mother ~ father ID 
ggplotly(
  ggplot(colony.clusters.larvae, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none"))
# How many fathers and mothers does each population + pCO2 group have? 
colony.clusters.larvae %>%
  group_by(population, pCO2.parent) %>%
  summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) %>%
  pivot_longer(names_to = "parent", values_to="n_parents", cols = c(n_fathers, n_mothers)) %>% 
  mutate(population=as.factor(population),pCO2.parent=as.factor(pCO2.parent),parent=as.factor(parent)) %>%
  ggplot(aes(fill=population, alpha=pCO2.parent, x=population, y=n_parents)) + geom_bar(position="dodge", stat="identity") +
  scale_alpha_discrete(range = c(0.35, 0.8)) + ylab("No. of parents") + 
  facet_wrap(~parent) + theme_minimal() + theme(axis.text.x = element_blank()) + xlab("Population & parental pCO2 exposure")
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.

Juvenile Sibship analysis via Colony

snpset.juvenile.4Colony <- snpgdsLDpruning(genofile.juvenile, maf=.4, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 48,939 SNPs (monomorphic: TRUE, MAF: 0.4, missing rate: 0)
##     # of samples: 16
##     # of SNPs: 1,669
##     using 1 thread
##     sliding window: 500,000 basepairs, Inf SNPs
##     |LD| threshold: 0.2
##     method: composite
## Chromosome Contig0_5313: 1.78%, 102/5,731
## Chromosome Contig104153_109739: 1.60%, 12/750
## Chromosome Contig109741_116186: 2.09%, 14/670
## Chromosome Contig11179_21262: 1.73%, 78/4,498
## Chromosome Contig116188_123130: 2.41%, 14/580
## Chromosome Contig123131_129982: 1.42%, 6/422
## Chromosome Contig129983_136896: 1.49%, 8/536
## Chromosome Contig136897_143777: 1.95%, 13/667
## Chromosome Contig143780_150710: 2.06%, 10/486
## Chromosome Contig150711_166372: 2.20%, 10/455
## Chromosome Contig166374_181896: 2.36%, 13/550
## Chromosome Contig181898_197428: 2.36%, 10/423
## Chromosome Contig197431_213218: 2.13%, 12/564
## Chromosome Contig21263_28135: 1.94%, 97/4,994
## Chromosome Contig213221_398012: 2.16%, 11/510
## Chromosome Contig28136_34745: 1.68%, 80/4,772
## Chromosome Contig34748_41067: 1.83%, 70/3,817
## Chromosome Contig398123_696856: 1.89%, 9/476
## Chromosome Contig41068_47193: 1.72%, 55/3,201
## Chromosome Contig47194_53180: 2.16%, 61/2,819
## Chromosome Contig5314_11178: 1.77%, 25/1,414
## Chromosome Contig53181_59083: 1.89%, 48/2,534
## Chromosome Contig59084_64831: 2.01%, 36/1,792
## Chromosome Contig64832_70524: 1.93%, 32/1,654
## Chromosome Contig70525_76135: 2.10%, 28/1,331
## Chromosome Contig76136_81765: 1.81%, 24/1,329
## Chromosome Contig81766_87352: 2.08%, 25/1,203
## Chromosome Contig87353_92929: 1.50%, 13/866
## Chromosome Contig92931_98526: 2.15%, 17/791
## Chromosome Contig98527_104152: 2.07%, 16/773
## 949 markers are selected in total.
snpset.juvenile.id.4Colony <- unlist(unname(snpset.juvenile.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-juvenile.gds",
                    dest.fn="../qc-processing/colony/genotypes-juvenile-4Colony.gds",
                    snp.id=snpset.juvenile.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 16 samples and 949 SNPs
##     write sample.id
##     write snp.id
##     write snp.rs.id
##     write snp.position
##     write snp.chromosome
##     write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

Recode genotypes for Colony

# Extract genotype matrix 
geno.matrix4Colony.juvenile <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-juvenile-4Colony.gds", with.id=TRUE)
## Genotype matrix: 16 samples X 949 SNPs
paste("No. of juvenile SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.juvenile$genotype), sep="") 
## [1] "No. of juvenile SNPs for Colony sibship analysis: 949"
paste("No. of juvenile samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.juvenile$genotype), sep="") 
## [1] "No. of juvenile samples for Colony sibship analysis: 16"
# Replace SNPRelate numeric allele coding with character allele coding 
a <- geno.matrix4Colony.juvenile$genotype %>% as_data_frame() %>% 
  mutate_all(funs(str_replace(., "0", "BB"))) %>%
  mutate_all(funs(str_replace(., "1", "AB"))) %>%
  mutate_all(funs(str_replace(., "2", "AA")))

# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info. 
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE 
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2. 
c <- b  #duplicate dataframe 
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])

for (i in 1:length(odds.names)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)  
}
for (i in 1:length(evens.names)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)  
}

# Now replace the character allele information with Colony's way of numeric coding. 
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele 
# 2= A allele 
# 0= missing allele 
(geno.matrix4Colony.juvenile.recode <- c %>% 
  mutate_all(funs(str_replace(., "B", "1"))) %>%
  mutate_all(funs(str_replace(., "A", "2"))) %>%
  mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 16 x 1,898
##    v1    v1_2  v2    v2_2  v3    v3_2  v4    v4_2  v5    v5_2  v6    v6_2  v7   
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 2     1     2     1     2     2     2     1     1     1     2     1     2    
##  2 2     1     1     1     1     1     2     2     1     1     2     2     2    
##  3 1     1     2     2     2     2     2     1     2     1     1     1     2    
##  4 2     1     2     1     1     1     2     1     2     1     2     1     2    
##  5 2     2     2     2     1     1     2     2     2     2     1     1     1    
##  6 2     1     2     2     2     2     2     2     2     1     2     2     2    
##  7 2     2     2     1     2     1     2     1     2     1     2     1     2    
##  8 2     1     2     1     2     1     2     1     2     2     2     1     1    
##  9 1     1     2     2     2     2     2     1     2     1     2     1     2    
## 10 2     2     1     1     2     2     2     1     2     1     2     2     2    
## 11 2     1     1     1     2     1     2     1     2     1     2     2     2    
## 12 2     1     1     1     2     2     2     1     1     1     2     1     2    
## 13 2     1     2     1     2     1     2     1     2     2     2     1     1    
## 14 2     2     2     1     2     1     2     1     2     2     2     1     1    
## 15 2     1     2     2     1     1     2     1     2     2     2     1     1    
## 16 2     2     2     1     2     1     1     1     2     1     2     1     1    
## # … with 1,885 more variables: v7_2 <chr>, v8 <chr>, v8_2 <chr>, v9 <chr>,
## #   v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>, v11_2 <chr>, v12 <chr>,
## #   v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>, v14_2 <chr>, v15 <chr>,
## #   v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>, v17_2 <chr>, v18 <chr>,
## #   v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>, v20_2 <chr>, v21 <chr>,
## #   v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>, v23_2 <chr>, v24 <chr>,
## #   v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>, v26_2 <chr>, v27 <chr>,
## #   v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>, v29_2 <chr>, v30 <chr>,
## #   v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>, v32_2 <chr>, v33 <chr>,
## #   v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>, v35_2 <chr>, v36 <chr>,
## #   v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>, v38_2 <chr>, v39 <chr>,
## #   v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>, v41_2 <chr>, v42 <chr>,
## #   v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>, v44_2 <chr>, v45 <chr>,
## #   v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>, v47_2 <chr>, v48 <chr>,
## #   v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>, v50_2 <chr>, v51 <chr>,
## #   v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>, v53_2 <chr>, v54 <chr>,
## #   v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>, v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number 
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.juvenile$sample.id, sep=""),
                  geno.matrix4Colony.juvenile.recode),
            file="../qc-processing/colony/geno.matrix.juvenile.tab", delim="   ", col_names=FALSE)

Create Colony input file

I created a text file that will be the input for Colony. I used Colony’s example input file as template and edited by hand. For the genotype matrix I selected all contents in the “geno.matrix.juvenile.tab” file and pasted it into the text file.

Here’s a snapshot of the input file, truncating the genotype info (there are 16 samples, but here I only show 1)

head -n 28 ../qc-processing/colony/colony2_juveniles.dat
tail -n 12 ../qc-processing/colony/colony2_juveniles.dat
## 'juvenile_snps'  !Dataset name
## 'juvenile_colony'  !Output file name
## 16         ! Number of offspring in the sample
## 950        ! Number of loci
## 100        ! Seed for random number generator
## 0          ! 0/1=Not updating/updating allele frequency
## 2          ! 2/1=Dioecious/Monoecious species
## 0          ! 0/1=No inbreeding/inbreeding
## 0          ! 0/1=Diploid species/HaploDiploid species
## 0  0       ! 0/1=Polygamy/Monogamy for males & females
## 0          ! 0/1=Clone inference =No/Yes
## 0          ! 0/1=Full sibship size scaling =No/Yes
## 0          ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0          ! 0/1=Unknown/Known population allele frequency
## 1         ! Number of runs
## 2         ! Length of run
## 0         ! 0/1=Monitor method by Iterate#/Time in second
## 100000    ! Monitor interval in Iterate# / in seconds
## 0         ! non-Windows version
## 1         ! Fulllikelihood
## 3         ! 1/2/3=low/medium/high Precision for Fulllikelihood
## 
## V@        !Marker names
## 0@        !Marker types, 0/1 = codominant/dominant
## 0.000@    !Allelic dropout rate  
## 0.0001@   !false allele rate
## 
## O137 2 1 2 1 2 2 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 2 2 2 2 1 2 2 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 1 1 2 2 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 1 1 1 2 2 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 2 1 1 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 1 1 2 1 1 1 2 2 2 1 2 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 1 1 2 1 1 1 2 1 2 1 2 2 1 1 1 1 2 1 2 1 2 1 1 1 2 1 1 1 2 2 2 2 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 1 1 1 1 2 2 2 1 2 1 2 2 2 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 2 2 2 1 2 2 2 1 2 2 1 1 1 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 1 1 1 1 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 2 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 2 1 2 2 2 1 2 2 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 1 2 1 1 1 2 1 2 2 1 1 2 1 2 2 2 2 2 2 1 1 2 2 2 1 1 1 2 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 2 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 1 1 2 1 1 1 2 2 2 1 1 1 2 2 1 1 2 1 1 1 2 2 1 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 2 1 1 1 2 2 2 2 2 1 1 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 1 1 2 2 2 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 2 1 2 2 2 1 1 1 1 1 2 2 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 1 1 1 1 1 2 2 2 1 2 2 2 1 1 1 2 2 2 2 2 1 2 2 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 2 1 2 1 2 2 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1
## 
## 
## 0  0     !prob. of dad/mum included in the candidates
## 0  0       !numbers of candiadte males & females
## 0  0          !#known fater-offspring dyads, paternity exclusion threshold
## 0  0          !#known moter-offspring dyads, maternity exclusion threshold
## 0             !#known paternal sibship with unknown fathers
## 0             !#known maternal sibship with unknown mothers
## 0             !#known paternity exclusions
## 0             !#known maternity exclusions
## 0             !#known paternal sibship exclusions
## 0             !#known maternal sibship exclusions

Run Colony analysis on juvenile samples

Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script

cd ../qc-processing/colony/
./colony2s.out IFN:colony2_juveniles.dat
cat ../qc-processing/colony/juvenile_colony.BestCluster
## ClusterIndex Probability          OffspringID             FatherID             MotherID
##            1      0.4954                 O137                   *1                   #1
##            1      0.4954                 O139                   *2                   #2
##            1      0.4954                 O140                   *2                   #1
##            1      0.4954                 O141                   *1                   #2
##            2      0.3684                 O156                   *3                   #3
##            2      0.3684                 O183                  *10                   #7
##            2      0.3684                 O184                  *10                   #8
##            2      0.3684                 O185                   *3                   #7
##            3      0.8991                 O159                   *4                   #4
##            3      0.8991                 O161                   *5                   #4
##            3      0.8991                 O162                   *5                   #4
##            4      0.8583                 O168                   *6                   #5
##            4      0.8583                 O169                   *7                   #5
##            4      0.8583                 O171                   *7                   #5
##            4      0.8583                 O172                   *8                   #5
##            5      1.0000                 O181                   *9                   #6
(colony.clusters.juvenile <- read_table("../qc-processing/colony/juvenile_colony.BestCluster", col_names=TRUE) %>% clean_names() %>% 
  mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
  mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
  mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.juvenile, by = c("offspring_id"="sample")))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ClusterIndex = col_double(),
##   Probability = col_double(),
##   OffspringID = col_character(),
##   FatherID = col_character(),
##   MotherID = col_character()
## )
##    cluster_index probability offspring_id father_id mother_id    stage
## 1              1      0.4954          137         1         1 juvenile
## 2              1      0.4954          139         2         2 juvenile
## 3              1      0.4954          140         2         1 juvenile
## 4              1      0.4954          141         1         2 juvenile
## 5              2      0.3684          156         3         3 juvenile
## 6              2      0.3684          183        10         7 juvenile
## 7              2      0.3684          184        10         8 juvenile
## 8              2      0.3684          185         3         7 juvenile
## 9              3      0.8991          159         4         4 juvenile
## 10             3      0.8991          161         5         4 juvenile
## 11             3      0.8991          162         5         4 juvenile
## 12             4      0.8583          168         6         5 juvenile
## 13             4      0.8583          169         7         5 juvenile
## 14             4      0.8583          171         7         5 juvenile
## 15             4      0.8583          172         8         5 juvenile
## 16             5      1.0000          181         9         6 juvenile
##     population pCO2.parent            pop_code
## 1    Dabob Bay     Ambient   Dabob Bay-Ambient
## 2    Dabob Bay     Ambient   Dabob Bay-Ambient
## 3    Dabob Bay     Ambient   Dabob Bay-Ambient
## 4    Dabob Bay     Ambient   Dabob Bay-Ambient
## 5  Fidalgo Bay     Ambient Fidalgo Bay-Ambient
## 6  Fidalgo Bay        High    Fidalgo Bay-High
## 7  Fidalgo Bay        High    Fidalgo Bay-High
## 8  Fidalgo Bay        High    Fidalgo Bay-High
## 9  Fidalgo Bay     Ambient Fidalgo Bay-Ambient
## 10 Fidalgo Bay     Ambient Fidalgo Bay-Ambient
## 11 Fidalgo Bay     Ambient Fidalgo Bay-Ambient
## 12   Dabob Bay        High      Dabob Bay-High
## 13   Dabob Bay        High      Dabob Bay-High
## 14   Dabob Bay        High      Dabob Bay-High
## 15   Dabob Bay        High      Dabob Bay-High
## 16 Fidalgo Bay        High    Fidalgo Bay-High
save(colony.clusters.juvenile, file = "../qc-processing/colony/best-cluster-juvenile")  

# Plot mother ~ father ID 
ggplotly(
  ggplot(colony.clusters.juvenile, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none"))
# How many fathers and mothers does each population + pCO2 group have? 
colony.clusters.juvenile %>%
  group_by(population, pCO2.parent) %>%
  summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) %>%
  pivot_longer(names_to = "parent", values_to="n_parents", cols = c(n_fathers, n_mothers)) %>% 
  mutate(population=as.factor(population),pCO2.parent=as.factor(pCO2.parent),parent=as.factor(parent)) %>%
  ggplot(aes(fill=population, alpha=pCO2.parent, x=population, y=n_parents)) + geom_bar(position="dodge", stat="identity") +
  scale_alpha_discrete(range = c(0.35, 0.8)) + ylab("No. of parents") + 
  facet_wrap(~parent) + theme_minimal() + theme(axis.text.x = element_blank()) + xlab("Population & parental pCO2 exposure")
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.